diff --git a/R/ADPRpages.R b/R/ADPRpages.R deleted file mode 100644 index 755ceaf6c..000000000 --- a/R/ADPRpages.R +++ /dev/null @@ -1,461 +0,0 @@ -#These functions each make a page for the ADPR. They assume a pdf with the following parameters for best formatting: -#pdf(file=paste(sample_sets, ".pdf", sep=""), width=22, height=15, pagecentre=TRUE, pointsize=24) - - -library(gplots) -library(ReadImages) - -##defaults<-par(no.readonly = TRUE) - - -tearsheet<-function(lanetable, sampletable, variant, Protocol, Sequencer){ - - #define layout - layout(matrix(c(1,1,2,4,3,5), ncol=2, nrow=3, byrow=TRUE), heights=c(1, 2.5,2.5,), respect=FALSE) - - #prep for title bar - title=paste(sample_sets, ": TEAR SHEET", sep="") - drop<-read.jpeg("tearsheetdrop.jpg") - - #plot title bar - par(mar=c(0,0,0,0)) - plot(drop) - text(100, 40, title, family="serif", adj=c(0,0), cex=3, col=gray(.25)) - - - #calc by lane stuff - sdlane<-rep("NA", 6) - meanlane<-sdlane - - attach(lanetable); - - callable.target<-HS_TARGET_TERRITORY[1]; - singlelanes<-length(which(Lane.Type=="Single")); - pairedlanes<-length(which(Lane.Type=="Paired")); - meanlane[1]<-round(mean(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2); - sdlane[1]<-round(sd(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2); - meanlane[2]<-round(mean(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2); - sdlane[2]<-round(sd(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2); - meanlane[3]<-round(mean(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE)); - sdlane[3]<-round(sd(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE)); - meanlane[4]<-round(mean(HS_PCT_TARGET_BASES_10X, na.rm=TRUE)); - meanlane[5]<-round(mean(HS_PCT_TARGET_BASES_20X, na.rm=TRUE)); - meanlane[6]<-round(mean(HS_PCT_TARGET_BASES_30X, na.rm=TRUE)); - sdlane[4]<-round(sd(HS_PCT_TARGET_BASES_10X, na.rm=TRUE)); - sdlane[5]<-round(sd(HS_PCT_TARGET_BASES_20X, na.rm=TRUE)); - sdlane[6]<-round(sd(HS_PCT_TARGET_BASES_30X, na.rm=TRUE)) - - names<-paste(Flowcell, "-", Lane, sep="") - - detach(lanetable) - - meansamp<-rep("NA", 6) - sdsamp<-meansamp - 
- #Calc by sample metrics - attach(bysample); - baits<-Bait.Set[1] - alllanes<-signif(sum(X..Lanes.included.in.aggregation, na.rm = TRUE)) - mean.lanes.samp<-signif(mean(X..Lanes.included.in.aggregation, na.rm = TRUE)); - sd.lanes.samp<-signif(sd(X..Lanes.included.in.aggregation, na.rm=TRUE)); - mean.mrl.samp<-signif(mean(Mean.Read.Length, na.rm=TRUE)); - sd.mrl.samp<-signif(sd(Mean.Read.Length, na.rm=TRUE)); - meansamp[1]<-round(mean(Total.Reads, na.rm=TRUE)/10^6, 2); - sdsamp[1]<-round(sd(Total.Reads, na.rm=TRUE)/10^6, 2); - meansamp[2]<-round(mean(On.Target.Bases..HS., na.rm=TRUE)/10^6, 2); - sdsamp[2]<-round(sd(On.Target.Bases..HS., na.rm=TRUE)/10^6, 2); - meansamp[3]<-round(mean(Mean.Target.Coverage..HS., na.rm=TRUE)); - sdsamp[3]<-round(sd(Mean.Target.Coverage..HS., na.rm=TRUE)); - meansamp[4]<-round(mean(PCT.Target.Bases.10x..HS., na.rm=TRUE)); - meansamp[5]<-round(mean(PCT.Target.Bases.20x..HS., na.rm=TRUE)); - meansamp[6]<-round(mean(PCT.Target.Bases.30x..HS., na.rm=TRUE)); - sdsamp[4]<-round(sd(PCT.Target.Bases.10x..HS., na.rm=TRUE)); - sdsamp[5]<-round(sd(PCT.Target.Bases.20x..HS., na.rm=TRUE)); - sdsamp[6]<-round(sd(PCT.Target.Bases.30x..HS., na.rm=TRUE)); - - detach(bysample); - - #calc variant stuff - attach(variant) - SNPS<-c(ti_count[which(filter_name=="called")]+tv_count[which(filter_name=="called")]) - titvs<-c(ti.tv_ratio[which(filter_name=="called")]) - detach(variant) - - #prep stuff. 
- summary<-c(nrow(bysample), Protocol, baits, paste(callable.target, "bases")) - summary2<-c(Sequencer, alllanes, paste(mean.lanes.samp, "+/-", sd.lanes.samp), paste(singlelanes, "single lanes,", pairedlanes, "paired lanes"), paste(mean.mrl.samp, "+/-", sd.mrl.samp)) - samps<-paste(meansamp, c("M", "M", "x", "%", "%", "%"), " +/- ", sdsamp, c("M", "M", "x", "%", "%", "%"), sep="") - lanes<-paste(meanlane, c("M", "M", "x", "%", "%", "%"), " +/- ", sdlane, c("M", "M", "x", "%", "%", "%"), sep="") - - #print out 4 tables in R - table1<-cbind(summary) - rownames(table1)<-c("Samples","Sequencing Protocol", "Bait Design","Callable Target") - par(mar=c(4,4,4,4)) - textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.75) - title(main="Project Summary", family="sans", cex.main=2) - - - table2<-cbind(lanes, samps) - colnames(table2)<-c("per lane", "per sample") - rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 10x") - par(mar=c(4,4,4,4)) - textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25) - title(main="Bases Summary", family="sans", cex.main=1.75) - - table3<-cbind(summary2) - rownames(table3)<-c("Sequencer", "Used lanes", "Used lanes per sample", "Lane pariteies", "Read legnths") - par(mar=c(4,4,4,4)) - textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25) - title(main="Sequencing Summary", family="sans", cex.main=1.75) - - - table4<-cbind(SNPS, titvs) - rownames(table4)<-c("All SNPs", "Known SNPs", "Novel SNPs") - colnames(table4)<-c("SNPs Found", "Ti/Tv") - textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25) - title(main="Variant Summary", family="sans", cex.main=1.75) - - } - -fingerprints<-function(lanetable, sample_sets){ - attach(lanetable) - - #define layout - layout(matrix(c(1,2,3), ncol=1, nrow=3, byrow=TRUE), heights=c(1, 3,2), respect=FALSE) - - #prep for title bar - title=paste(sample_sets, ": Fingerprint 
Status", sep="") - drop<-read.jpeg("adprdrop.jpg") - - #plot title bar - par(mar=c(0,0,0,0)) - plot(drop) - text(100, 40, title, family="serif", adj=c(0,0), cex=3, col=gray(.25)) - - #prep for FP plot - badsnps<-union(which(FP_CONFIDENT_MATCHING_SNPS<15), which(FP_CONFIDENT_MATCHING_SNPS<15)) - colors<-c(rep("Blue", length(FP_CONFIDENT_CALLS))) - colors[badsnps]<-"Red" - ticks<-c(match(unique(Flowcell), Flowcell) ) - ys=rep(c(0, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, 0, 0), ceiling(length(ticks)/2)) - shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5 - if((length(ticks)%%2 > 0)){ - shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5 - } - shader<-na.omit(shader) - - #plot FP plot - par(mar=c(10, 6, 8, 3)) - plot(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=NA, ylim=c(0,24), ylab="Fingerprint calls", xlab="", xaxt="n", col=colors, main="Fingerprint Calling and Matching Sorted by Flowcell", cex.main=2) - axis(side=3, at=c(1:length(Flowcell)), labels=Lane[order(Flowcell)], cex.axis=0.5, padj=1,tick=FALSE) - axis(side=1, at=c(ticks), labels=sort(unique(Flowcell)), tick=FALSE, las=2) - mtext("Lane",side=3, cex=.75, line=1.5) - mtext("Flowcell",side=1, cex=1.25, line=8) - polygon(shader, ys, border="black", lty=0, col="gray") - points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=4, col=colors) - points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_CALLS, pch=3, col=colors) - if(length(badsnps)>0){ - legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes", "All Confident calls match fingerprint sites"), pch=c(4,3,4,3,8), col=c("Blue", "Blue", "Red", "Red", "Black" ), bg="White") - mtext("Some problematic fingerprint sites", side=3) - }else{ - 
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "All Confident calls match fingerprint sites"), pch=c(4, 3, 8), col="Blue", bg="White") - } - - #plot some summary of FP stuff - textplot("Some summary of Fingerprint problems will go here ", valign="top", family="sans") - - detach(lanetable) - } - -snps_called<-function(lanetable, sample_sets){ - attach(lanetable) - - #define layout for this page - layout(matrix(c(1,1,2, 3, 4,4), ncol=2, nrow=3, byrow=TRUE), widths = c(3,1), heights=c(1, 3,2), respect=FALSE) - - #prep for title bar - title=paste(sample_sets, ": SNPs Called by Lane", sep="") - drop<-read.jpeg("adprdrop.jpg") - - #plot title bar - par(mar=c(0,0,0,0)) - plot(drop) - text(100, 40, title, family="serif", adj=c(0,0), cex=3, col=gray(.25)) - - #prep for snp plot - ticks<-c(match(unique(Flowcell), sort(Flowcell)) ) - ys=rep(c(min(SNP_TOTAL_SNPS, na.rm=TRUE), max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, min(SNP_TOTAL_SNPS, na.rm=TRUE), min(SNP_TOTAL_SNPS, na.rm=TRUE)), ceiling(length(ticks)/2)) - shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5 - if((length(ticks)%%2 > 0)){ - shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5 - } - shader<-na.omit(shader) - cols<-rep("blue", length(SNP_TOTAL_SNPS)) - cols[which(SNP_TOTAL_SNPS %in% boxplot.stats(SNP_TOTAL_SNPS)$out)]<-"red" - - #plot snp plot - par(ylog=TRUE, mar=c(10, 6, 4, 0)) - plot(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS[order(Flowcell)],xlab="", - ylab="SNPs Called", - ylim = c(min(SNP_TOTAL_SNPS, na.rm=TRUE), max(SNP_TOTAL_SNPS, na.rm=TRUE)), - xaxt="n", - pch=NA) - title(main="SNPs Called in Each Lane sorted by Flowcell", line=3, cex=1.5) - axis(side=3, at=c(1:length(Flowcell)), labels=Lane[order(Flowcell)], cex.axis=0.5, padj=1,tick=FALSE) - axis(side=1, at=c(ticks), labels=sort(unique(Flowcell)), tick=FALSE, 
las=2) - mtext("Lane",side=3, cex=.75, line=1.5) - mtext("Flowcell",side=1, cex=1.25, line=8) - polygon(shader, ys, border="black", lty=0, col="gray") - points(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS, col=cols, pch=19) - if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)>0){ - legend("topright", legend=c("Normal SNP Call Counts", "Outlier SNP Call Counts"), pch=19, col=c("Blue", "red"), bg="White") - } - - #plot boxplot - par(ylog=TRUE, mar=c(10, 0, 4, 2)) - boxplot(SNP_TOTAL_SNPS, main="SNPs Called in Lane", ylab="", yaxt="n", ylim = c(min(SNP_TOTAL_SNPS, na.rm=TRUE), max(SNP_TOTAL_SNPS, na.rm=TRUE)), ylog=TRUE) - if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)==0){ - mtext("No outliers", side=1, line=4) - }else{ - mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4) - } - - #Plot variant summary below - textplot("Variant Summary will go here", valign="top", family="sans") - - detach(lanetable) - } - -titvsamp<-function(metricsbysamp){ - attach(titv) - - #define layout - layout(matrix(c(1,2,3), ncol=1, nrow=3, byrow=TRUE), heights=c(1, 3,2), respect=FALSE) - - #prep for title bar - title=paste(sample_sets, ": Ti/Tv Ratio by Sample", sep="") - drop<-read.jpeg("adprdrop.jpg") - - #plot title bar - par(mar=c(0,0,0,0)) - plot(drop) - text(100, 40, title, family="serif", adj=c(0,0), cex=3, col=gray(.25)) - - #prep for titv graph - boxplot.stats(TiTvRatio[which(filter_name=="filtered")])$stats[5]->min - shade<-which(sort(TiTvRatio[which(novelty_name=="novel" & filter_name=="called")], decreasing=TRUE)0.3) #this can be changed to any kind of filter for particular lanes - colors<-rainbow(ncol(errpercycle), s=0.5, v=0.5) - colors[crazies]<-rainbow(length(crazies)) - weights<-rep(1, ncol(errpercycle)) - weights[crazies]<-2 - - #plot erprp graph - par(mar=c(6, 6, 3, 2)) - matplot(errpercycle, - type="l", - lty="solid", - col=colors, - lwd=weights, - main="Error Rate per Read Position", - ylab="Error Rate", - xlab="Cycle/Read 
Position", - log="y", - cex.main=2, - cex.lab=1.5, - cex.axis=1.25, - ) - if(length(crazies)>0){ - legend("topleft", title="Unusual Lanes", legend=colnames(errpercycle)[crazies], lty="solid", lwd=2, col=colors[crazies], xjust=0.5) - }else{ - mtext("No unusual lanes.", 1, line=6, cex=1.25) - } - - #Plot variant summary below - textplot("Something related will go here", valign="top", family="sans") - - } - -depth_target<-function(DOC){ - - #define layout - layout(matrix(c(1,2), ncol=1, nrow=2, byrow=TRUE), heights=c(1, 5), respect=FALSE) - - #prep for title bar - title=paste(sample_sets, ": Depth of Coverage By Target", sep="") - drop<-read.jpeg("adprdrop.jpg") - - #plot title bar - par(mar=c(0,0,0,0)) - plot(drop) - text(100, 40, title, family="serif", adj=c(0,0), cex=1.75, col=gray(.25)) - - colnames(DOC)->cols - apply(DOC[,grep("mean", cols)], 1, median)->medianofmeans - apply(DOC[,grep("mean", cols)], 1, quantile, probs=3/4)->q3s - apply(DOC[,grep("mean", cols)], 1, quantile, probs=1/4)->q1s - - par(ylog=FALSE, mar=c(5, 5, 4, 2)) - plot(c(1:3122),sort(medianofmeans, decreasing=TRUE), type="l",log="y",ylab="Coverage", xlab="",xaxt="n", main="Coverage Across All Targets", lwd=2, cex.main=2.5, cex.lab=1.5, cex.axis=1.25) - mtext("Targets sorted by median avereage coverage across sample", side=1, line=1, cex=1.5) - abline(h=10, lty="dashed", lwd=3) - lines(c(1:3122),q3s[order(medianofmeans, decreasing=TRUE)], col="dark blue") - lines(c(1:3122),q1s[order(medianofmeans, decreasing=TRUE)], col="dark blue") - legend(c(0, 20), legend="10x coverage", box.lty=0, lwd=3, lty="dashed") - legend("bottomleft", legend=c("Median average target coverage across all samples", "First and third quartiles of average target across all sample"), box.lty=0, lwd=c(1,2), col=c("black", "dark blue"), lty="solid") - - - #define layout - layout(matrix(c(1,2), ncol=1, nrow=2, byrow=TRUE), heights=c(1,5), respect=FALSE) - - #prep for title bar - title=paste(sample_sets, ": Depth of Coverage For 
Poorly Covered Targets", sep="") - drop<-read.jpeg("adprdrop.jpg") - - #plot title bar - par(mar=c(0,0,0,0)) - plot(drop) - text(100, 40, title, family="serif", adj=c(0,0), cex=1.25, col=gray(.25)) - yuck<-DOC[which(medianofmeans<10),grep("mean", cols)] - yuck<-yuck+0.1 - par(mar=c(17, 4, 4, 2)) - boxplot(t(yuck[order(medianofmeans[which(medianofmeans<10)], decreasing=TRUE),]),log="y", yaxt="n", xaxt="n", cex.lab=1.15, cex.axis=1.05, ylab="Average coverage accross all samples", main="Targets with low coverage accross samples") - - axis(2, at=axTicks(2)+c(0, rep(0.1, length(axTicks(2))-1)), labels=c(0.0, axTicks(2)[2:length(axTicks(2))]), cex.axis=0.75) - mtext("Target", side=1, line=15, cex=1.5) - axis(1, at=c(1:length(which(medianofmeans<10))), labels=rownames(DOC[which(medianofmeans<10),])[order(medianofmeans[which(medianofmeans<10)])], las=2, cex.axis=1.15) - } - -depth_sample<-function(DOC2){ - - #define layout - layout(matrix(c(1,2), ncol=1, nrow=2, byrow=TRUE), heights=c(1,5), respect=FALSE) - - #prep for title bar - title=paste(sample_sets, ": Mean Depth of Coverage per Base by Sample", sep="") - drop<-read.jpeg("adprdrop.jpg") - - #plot title bar - par(mar=c(0,0,0,0)) - plot(drop) - text(100, 40, title, family="serif", adj=c(0,0), cex=1.25, col=gray(.25)) - #prep for bysample - means<-c(sort(DOC2[which(DOC2[,2]<250),2]), rep(250, (length(which(DOC2[,2]>=250))-1))) - types<-rep(20, length(means)) - cols<-rep("black", length(means)) - types[which(means==250)]<-8 - cols[which(means==250)]<-"red" - - #plot doc by sample - - par(mar=c(10, 4, 4, 2)) - plot(means, ylim=c(0, 250), xaxt="n", col=cols, pch=types, xlab="", ylab="Depth of Coverage") -> axis(1, at=c(1:(nrow(DOC2)-1)), labels=c(rownames(DOC2[which(DOC2[,2]<250),])[order(DOC2[which(DOC2[,2]<250),2])], rownames(DOC2[which(DOC2[,2]>=250),])[order(which(DOC2[,2]>=250))][1:(length(which(DOC2[,2]>=250))-1)]), las=2) -> mtext("Samples", side=1, line=7, cex=1.25) - - - } - - - -datapuller<-function(setname){ - 
#library(yaml) - - strsplit(setname, ".")[1]->projectname - - - lanes<-read.delim(paste(projectname, "_lanes.txt", sep=""), header=TRUE) - samps<-read.delim(paste(projectname, "_samps.txt", sep=""), header=TRUE) - #doct<-read.delim(paste(setname, "depth.sample_interval_summary", sep=""), header=TRUE, row.names=1) - #docs<-read.delim(paste(setname, ".depth.sample_summary", sep=""), header=TRUE, row.names=1) - #eval<-read.csv(paste(setname, "eval.CountFunctionalClasses", sep=""), skip=1) - titv<-read.csv(paste(setname, ".eval.SimpleMetricsBySample.csv", sep=""), skip=1) - #erprp<-read.delim(paste(setname, ".erprp", sep="")) - - colnames(lanes)<-c('Initiative','Project','GSSR.ID','External.ID','WR.ID','Flowcell','Lane','Lane.Type','Library','AL_TOTAL_READS','AL_PF_READS','AL_PCT_PF_READS','AL_PF_NOISE_READS','AL_PF_READS_ALIGNED','AL_PCT_PF_READS_ALIGNED','AL_PF_HQ_ALIGNED_READS','AL_PF_HQ_ALIGNED_BASES','AL_PF_HQ_ALIGNED_Q20_BASES','AL_PF_HQ_MEDIAN_MISMATCHES','AL_MEAN_READ_LENGTH','AL_READS_ALIGNED_IN_PAIRS','AL_PCT_READS_ALIGNED_IN_PAIRS','AL_BAD_CYCLES','AL_PCT_STRAND_BALANCE','DUP_UNPAIRED_READS_EXAMINED','DUP_READ_PAIRS_EXAMINED','DUP_UNMAPPED_READS','DUP_UNPAIRED_READ_DUPLICATES','DUP_READ_PAIR_DUPLICATES','DUP_PERCENT_DUPLICATION','DUP_ESTIMATED_LIBRARY_SIZE','HS_BAIT_SET','HS_GENOME_SIZE','HS_LIBRARY_SIZE','HS_BAIT_TERRITORY','HS_TARGET_TERRITORY','HS_BAIT_DESIGN_EFFICIENCY','HS_TOTAL_READS','HS_PF_READS','HS_PF_UNIQUE_READS','HS_PCT_PF_READS','HS_PCT_PF_UQ_READS','HS_PCT_PF_UQ_READS_ALIGNED','HS_PF_UQ_READS_ALIGNED','HS_PF_UQ_BASES_ALIGNED','HS_ON_BAIT_BASES','HS_NEAR_BAIT_BASES','HS_OFF_BAIT_BASES','HS_ON_TARGET_BASES','HS_PCT_SELECTED_BASES','HS_PCT_OFF_BAIT','HS_ON_BAIT_VS_SELECTED','HS_MEAN_BAIT_COVERAGE','HS_MEAN_TARGET_COVERAGE','HS_FOLD_ENRICHMENT','HS_ZERO_CVG_TARGETS_PCT','HS_FOLD_80_BASE_PENALTY','HS_PCT_TARGET_BASES_2X','HS_PCT_TARGET_BASES_10X','HS_PCT_TARGET_BASES_20X','HS_PCT_TARGET_BASES_30X','HS_PENALTY_10X','HS_PENALTY_20X','HS_PENALTY_30X','
SNP_TOTAL_SNPS','SNP_PCT_DBSNP','SNP_NUM_IN_DBSNP','Lane.IC.Matches','Lane.IC.PCT.Mean.RD1.Err.Rate','Lane.IC.PCT.Mean.RD2.Err.Rate','FP_PANEL_NAME','FP_PANEL_SNPS','FP_CONFIDENT_CALLS','FP_CONFIDENT_MATCHING_SNPS','FP_CONFIDENT_CALLED_PCT','FP_CONFIDENT_MATCHING_SNPS_PCT','LPCNCRD_REFERENCE','LPCNCRD_NON_REFERENCE','LPCNCRD_PCT_CONCORDANCE') - - files<-list(c(lanes, samps, doct, docs, eval, titv, erprp)) - - return(files) - } - - -runner<-function(basename, desc1, desc2){ - datapuller(basename)->tables - attach(tables) - - - - pdf(paste(basename, ".pdf", sep=""), width=22, height=15,pointsize=24) - - tearsheet(lanes, samps, titv, desc1, desc1) - fingerprints(lanes) - snps_called(lanes) - titvsamp(titv) - #functionalclasses(eval) - #errorratepercycle(erprp) - #depth_target(doct) - #depth_sample(docs) - - dev.off() - detach(tables) - } - -if(length(commandArgs(TRUE))>0){ - runner(commandArgs(TRUE)) - } - - - - diff --git a/R/Data.Processing.Report.r b/R/Data.Processing.Report.r deleted file mode 100644 index 7294560a4..000000000 --- a/R/Data.Processing.Report.r +++ /dev/null @@ -1,325 +0,0 @@ -#Before executing this file, save squid files as csv, then as tab deliminated files with only the column values as the header, change the format of all cells to numbers. Assign the path to these files to "samples" and "lanes" respectively. 
-#set up database stuff for firehose and picard interface -#set up so runnable by firehsoe - -stuffmaker<-function(args){ - -lanes<-args[1] -samples<-args[2] -sample_sets<-args[3] -eval<-args[4] -titveval<-args[5] -DOCi<-args[6] -DOCs<-args[7] - -library(gplots) - -pdf(file=paste(sample_sets, ".pdf", sep=""), width=22, height=15, pagecentre=TRUE, pointsize=24) - - -if(is.na(sample_sets)){ - print("Please specify sample set for file naming and press enter.") - scan("stdin", what="character",n=1)->sample_sets - print("Thanks!") - } - - if(is.na(lanes) == FALSE && is.na(samples)==FALSE){ - #this makes a table & graphs using Picard data - - if(typeof(lanes)=="character"){ - read.delim(file=lanes, header= TRUE)->bylane; - colnames(bylane)<-c('Initiative','Project','GSSR.ID','External.ID','WR.ID','Flowcell','Lane','Lane.Type','Library','AL_TOTAL_READS','AL_PF_READS','AL_PCT_PF_READS','AL_PF_NOISE_READS','AL_PF_READS_ALIGNED','AL_PCT_PF_READS_ALIGNED','AL_PF_HQ_ALIGNED_READS','AL_PF_HQ_ALIGNED_BASES','AL_PF_HQ_ALIGNED_Q20_BASES','AL_PF_HQ_MEDIAN_MISMATCHES','AL_MEAN_READ_LENGTH','AL_READS_ALIGNED_IN_PAIRS','AL_PCT_READS_ALIGNED_IN_PAIRS','AL_BAD_CYCLES','AL_PCT_STRAND_BALANCE','DUP_UNPAIRED_READS_EXAMINED','DUP_READ_PAIRS_EXAMINED','DUP_UNMAPPED_READS','DUP_UNPAIRED_READ_DUPLICATES','DUP_READ_PAIR_DUPLICATES','DUP_PERCENT_DUPLICATION','DUP_ESTIMATED_LIBRARY_SIZE','HS_BAIT_SET','HS_GENOME_SIZE','HS_LIBRARY_SIZE','HS_BAIT_TERRITORY','HS_TARGET_TERRITORY','HS_BAIT_DESIGN_EFFICIENCY','HS_TOTAL_READS','HS_PF_READS','HS_PF_UNIQUE_READS','HS_PCT_PF_READS','HS_PCT_PF_UQ_READS','HS_PCT_PF_UQ_READS_ALIGNED','HS_PF_UQ_READS_ALIGNED','HS_PF_UQ_BASES_ALIGNED','HS_ON_BAIT_BASES','HS_NEAR_BAIT_BASES','HS_OFF_BAIT_BASES','HS_ON_TARGET_BASES','HS_PCT_SELECTED_BASES','HS_PCT_OFF_BAIT','HS_ON_BAIT_VS_SELECTED','HS_MEAN_BAIT_COVERAGE','HS_MEAN_TARGET_COVERAGE','HS_FOLD_ENRICHMENT','HS_ZERO_CVG_TARGETS_PCT','HS_FOLD_80_BASE_PENALTY','HS_PCT_TARGET_BASES_2X','HS_PCT_TARGET_BASES_10X','HS_PCT_TA
RGET_BASES_20X','HS_PCT_TARGET_BASES_30X','HS_PENALTY_10X','HS_PENALTY_20X','HS_PENALTY_30X','SNP_TOTAL_SNPS','SNP_PCT_DBSNP','SNP_NUM_IN_DBSNP','Lane.IC.Matches','Lane.IC.PCT.Mean.RD1.Err.Rate','Lane.IC.PCT.Mean.RD2.Err.Rate','FP_PANEL_NAME','FP_PANEL_SNPS','FP_CONFIDENT_CALLS','FP_CONFIDENT_MATCHING_SNPS','FP_CONFIDENT_CALLED_PCT','FP_CONFIDENT_MATCHING_SNPS_PCT','LPCNCRD_REFERENCE','LPCNCRD_NON_REFERENCE','LPCNCRD_PCT_CONCORDANCE') - }else{ - lanes->bylane - colnames(bylane)<-c('Initiative','Project','GSSR.ID','External.ID','WR.ID','Flowcell','Lane','Lane.Type','Library','AL_TOTAL_READS','AL_PF_READS','AL_PCT_PF_READS','AL_PF_NOISE_READS','AL_PF_READS_ALIGNED','AL_PCT_PF_READS_ALIGNED','AL_PF_HQ_ALIGNED_READS','AL_PF_HQ_ALIGNED_BASES','AL_PF_HQ_ALIGNED_Q20_BASES','AL_PF_HQ_MEDIAN_MISMATCHES','AL_MEAN_READ_LENGTH','AL_READS_ALIGNED_IN_PAIRS','AL_PCT_READS_ALIGNED_IN_PAIRS','AL_BAD_CYCLES','AL_PCT_STRAND_BALANCE','DUP_UNPAIRED_READS_EXAMINED','DUP_READ_PAIRS_EXAMINED','DUP_UNMAPPED_READS','DUP_UNPAIRED_READ_DUPLICATES','DUP_READ_PAIR_DUPLICATES','DUP_PERCENT_DUPLICATION','DUP_ESTIMATED_LIBRARY_SIZE','HS_BAIT_SET','HS_GENOME_SIZE','HS_LIBRARY_SIZE','HS_BAIT_TERRITORY','HS_TARGET_TERRITORY','HS_BAIT_DESIGN_EFFICIENCY','HS_TOTAL_READS','HS_PF_READS','HS_PF_UNIQUE_READS','HS_PCT_PF_READS','HS_PCT_PF_UQ_READS','HS_PCT_PF_UQ_READS_ALIGNED','HS_PF_UQ_READS_ALIGNED','HS_PF_UQ_BASES_ALIGNED','HS_ON_BAIT_BASES','HS_NEAR_BAIT_BASES','HS_OFF_BAIT_BASES','HS_ON_TARGET_BASES','HS_PCT_SELECTED_BASES','HS_PCT_OFF_BAIT','HS_ON_BAIT_VS_SELECTED','HS_MEAN_BAIT_COVERAGE','HS_MEAN_TARGET_COVERAGE','HS_FOLD_ENRICHMENT','HS_ZERO_CVG_TARGETS_PCT','HS_FOLD_80_BASE_PENALTY','HS_PCT_TARGET_BASES_2X','HS_PCT_TARGET_BASES_10X','HS_PCT_TARGET_BASES_20X','HS_PCT_TARGET_BASES_30X','HS_PENALTY_10X','HS_PENALTY_20X','HS_PENALTY_30X','SNP_TOTAL_SNPS','SNP_PCT_DBSNP','SNP_NUM_IN_DBSNP','Lane.IC.Matches','Lane.IC.PCT.Mean.RD1.Err.Rate','Lane.IC.PCT.Mean.RD2.Err.Rate','FP_PANEL_NAME','FP_PANEL_SNPS','F
P_CONFIDENT_CALLS','FP_CONFIDENT_MATCHING_SNPS','FP_CONFIDENT_CALLED_PCT','FP_CONFIDENT_MATCHING_SNPS_PCT','LPCNCRD_REFERENCE','LPCNCRD_NON_REFERENCE','LPCNCRD_PCT_CONCORDANCE') - } - if(typeof(samples)=="character"){ - read.delim(file=samples, header= TRUE)->bysample; - }else{ - samples->bysample - } - - #Calc by lane metrics - sdlane<-rep("NA", 6) - meanlane<-sdlane - attach(bylane); - - callable.target<-HS_TARGET_TERRITORY[1]; - singlelanes<-length(which(Lane.Type=="Single")); - pairedlanes<-length(which(Lane.Type=="Paired")); - meanlane[1]<-round(mean(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2); - sdlane[1]<-round(sd(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2); - meanlane[2]<-round(mean(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2); - sdlane[2]<-round(sd(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2); - meanlane[3]<-round(mean(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE)); - sdlane[3]<-round(sd(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE)); - meanlane[4]<-round(mean(HS_PCT_TARGET_BASES_10X, na.rm=TRUE)); - meanlane[5]<-round(mean(HS_PCT_TARGET_BASES_20X, na.rm=TRUE)); - meanlane[6]<-round(mean(HS_PCT_TARGET_BASES_30X, na.rm=TRUE)); - sdlane[4]<-round(sd(HS_PCT_TARGET_BASES_10X, na.rm=TRUE)); - sdlane[5]<-round(sd(HS_PCT_TARGET_BASES_20X, na.rm=TRUE)); - sdlane[6]<-round(sd(HS_PCT_TARGET_BASES_30X, na.rm=TRUE)) - - names<-paste(Flowcell, "-", Lane, sep="") - - #makes a plot of the number of SNPS called per lane - - - ticks<-c(match(unique(Flowcell), sort(Flowcell)) ) - ys=rep(c(min(SNP_TOTAL_SNPS, na.rm=TRUE)*0.96, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, min(SNP_TOTAL_SNPS, na.rm=TRUE)*0.96, min(SNP_TOTAL_SNPS, na.rm=TRUE)*0.96), ceiling(length(ticks)/2)) - - defaults<-par(no.readonly = TRUE) - - layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE) - par(mar=c(10, 6, 4, 8)) - plot(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS[order(Flowcell)],xlab="", ylab="SNPs Called in Lane", ylim = c(min(SNP_TOTAL_SNPS, na.rm=TRUE), max(SNP_TOTAL_SNPS, na.rm=TRUE)), xaxt="n", 
pch=NA) - title(main=paste(sample_sets, ": SNPs Called in Each Lane sorted by Flowcell", sep=""), line=3, cex=1.25) - axis(side=3, at=c(1:length(Flowcell)), labels=Lane[order(Flowcell)], cex.axis=0.5, padj=1,tick=FALSE) - axis(side=1, at=c(ticks), labels=sort(unique(Flowcell)), tick=FALSE, las=2) - mtext("Lane",side=3, cex=.75, line=1.5) - mtext("Flowcell",cex=.75,side=1, line=8) - - shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5 - if((length(ticks)%%2 > 0)){ - shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5 - } - shader<-na.omit(shader) - polygon(shader, ys, border="black", lty=0, col="gray") - cols<-rep("blue", length(SNP_TOTAL_SNPS)) - cols[which(SNP_TOTAL_SNPS %in% boxplot.stats(SNP_TOTAL_SNPS)$out)]<-"red" - points(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS, col=cols, pch=19) - if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)>0){ - legend("topright", legend=c("Normal SNP Call Counts", "Outlier SNP Call Counts"), pch=19, col=c("Blue", "red"), bg="White") - } - - boxplot(SNP_TOTAL_SNPS, main="SNPs Called in Lane", ylab="SNPs Called" ) - - - if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)==0){ - mtext("No outliers", side=1, line=4) - }else{ - mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4) - } - - - - - - #makes a plot of fingerprint calls and labels them good or bad - par(defaults) - - - badsnps<-union(which(FP_CONFIDENT_MATCHING_SNPS<15), which(FP_CONFIDENT_MATCHING_SNPS<15)) - - colors<-c(rep("Blue", length(FP_CONFIDENT_CALLS))) - colors[badsnps]<-"Red" - ticks<-c(match(unique(Flowcell), Flowcell) ) - ys=rep(c(0, 24*1.04, 24*1.04, 0, 0), ceiling(length(ticks)/2)) - #pdf(file=paste(sample_sets, "_Fingerprints.pdf", sep=""), width=.2*length(FP_CONFIDENT_CALLS), height=.1*length(FP_CONFIDENT_CALLS)) - par(mar=c(10, 6, 8, 3)) - plot(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=NA, ylim=c(0,24), 
ylab="Fingerprint calls", xlab="", xaxt="n", col=colors, main="Fingerprint Calling and Matching Sorted by lane") - axis(side=1, at=(ticks+1), labels=unique(Flowcell), tick=FALSE, hadj=1, las=2) - shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5 - shader<-na.omit(shader) - if((length(ticks)%%2 > 0)){ - shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5 - } - - polygon(shader, ys, border="black", lty=0, col="gray") - points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=4, col=colors) - - points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_CALLS, pch=3, col=colors) - - - - - if(length(badsnps)>0){ - legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes", "All Confident calls match fingerprint sites"), pch=c(4,3,4,3,8), col=c("Blue", "Blue", "Red", "Red", "Black" ), bg="White") - mtext("Some problematic fingerprint sites", side=3) - }else{ - legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "All Confident calls match fingerprint sites"), pch=c(4, 3, 8), col=c("Blue", "Blue", "Black"), bg="White") - } - - - detach(bylane) - - - }else{ - print("Lane and Sample metrics file paths not provided") - } - meansamp<-rep("NA", 6) - sdsamp<-meansamp - - #Calc by sample metrics - attach(bysample); - mean.lanes.samp<-signif(mean(X..Lanes.included.in.aggregation, na.rm = TRUE)); - sd.lanes.samp<-signif(sd(X..Lanes.included.in.aggregation, na.rm=TRUE)); - mean.mrl.samp<-signif(mean(Mean.Read.Length, na.rm=TRUE)); - sd.mrl.samp<-signif(sd(Mean.Read.Length, na.rm=TRUE)); - meansamp[1]<-round(mean(Total.Reads, na.rm=TRUE)/10^6, 2); - sdsamp[1]<-round(sd(Total.Reads, na.rm=TRUE)/10^6, 2); - meansamp[2]<-round(mean(On.Target.Bases..HS., 
na.rm=TRUE)/10^6, 2); - sdsamp[2]<-round(sd(On.Target.Bases..HS., na.rm=TRUE)/10^6, 2); - meansamp[3]<-round(mean(Mean.Target.Coverage..HS., na.rm=TRUE)); - sdsamp[3]<-round(sd(Mean.Target.Coverage..HS., na.rm=TRUE)); - meansamp[4]<-round(mean(PCT.Target.Bases.10x..HS., na.rm=TRUE)); - meansamp[5]<-round(mean(PCT.Target.Bases.20x..HS., na.rm=TRUE)); - meansamp[6]<-round(mean(PCT.Target.Bases.30x..HS., na.rm=TRUE)); - sdsamp[4]<-round(sd(PCT.Target.Bases.10x..HS., na.rm=TRUE)); - sdsamp[5]<-round(sd(PCT.Target.Bases.20x..HS., na.rm=TRUE)); - sdsamp[6]<-round(sd(PCT.Target.Bases.30x..HS., na.rm=TRUE)); - - detach(bysample); - - #print all of this stuff out in R. - summary<-c(paste(callable.target, "bases"), paste(mean.lanes.samp, "+/-", sd.lanes.samp), paste(singlelanes, "single lanes,", pairedlanes, "paired lanes"), paste(mean.mrl.samp, "+/-", sd.mrl.samp)) - - samps<-paste(meansamp, c("M", "M", "x", "%", "%", "%"), " +/- ", sdsamp, c("M", "M", "x", "%", "%", "%"), sep="") - - lanes<-paste(meanlane, c("M", "M", "x", "%", "%", "%"), " +/- ", sdlane, c("M", "M", "x", "%", "%", "%"), sep="") - - layout(matrix(c(1,2), ncol=1), heights=c(2,3)) - - table1<-cbind(summary) - rownames(table1)<-c("Callable Target", "Used Lanes per Sample", "Parities", "Read Length") - textplot(table1, col.rownames="blue", show.colnames=FALSE, cex=1.75) - title(main="Sequencing Summary", family="serif", cex.main=2) - table2<-cbind(lanes, samps) - colnames(table2)<-c("per lane", "per sample") - rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 10x") - textplot(table2, rmar=1, col.rownames="blue", cex=1.25) - title(main="Bases Summary", family="serif", cex.main=1.75) - - - par(defaults) - - #Makes Error Rate percycle graph - if(is.na(eval)==FALSE){ - if(typeof(eval)=="character"){ - read.delim(eval, header=TRUE)[2:ncol(read.delim(eval, header=TRUE))]->errpercycle - }else{ - eval->errpercycle - } - - - 
#pdf(paste(sample_sets, "_errorrate_per_cycle.pdf", sep=""), width=6, height=5) - - crazies<-which(errpercycle[75,]>0.3) #this can be changed to any kind of filter for particular lanes - - colors<-rainbow(ncol(errpercycle), s=0.5, v=0.5) - colors[crazies]<-rainbow(length(crazies)) - weights<-rep(1, ncol(errpercycle)) - weights[crazies]<-2 - - matplot(errpercycle, type="l", lty="solid", col=colors, lwd=weights, main="Error Rate per Cycle", ylab="Error Rate", xlab="Cycle", ylim=c(0, 0.7)) - - if(length(crazies)>0){ - legend("topleft", title="Unusual Lanes", legend=colnames(errpercycle)[crazies], lty="solid", lwd=2, col=colors[crazies], xjust=0.5) - }else{ - legend("topleft", legend="No unusual lanes.", bty="n") - } - - - - }else{ - print("Error Rate Per Cycle file paths not provided") - } - - #Makes TI/TV known v novel graph - if(is.na(titveval)==FALSE){ - ##TODO: need ot make sure this is nice and prettified. - titv<-read.csv(file=titveval, skip=1) - attach(titv) - - #pdf(file=paste(sample_sets, "_TI-TV.pdf", sep=""), width=0.2*length(unique(sample)), height=0.175*length(unique(sample))) - par(mar=c(11, 4, 4, 2)) - plot(seq(1:length(unique(sample))), Ti.Tv[which(novelty_name=="novel" & filter_name=="called")], xaxt="n", ylim=c(1, 4), main="Ti/Tv for Novel and Known SNP calls", ylab="Ti/Tv", xlab="", col="red", pch=1) - - points(seq(1:length(unique(sample))), Ti.Tv[which(novelty_name=="known" & filter_name=="called")], pch=1, col="blue") - - axis(side=1, at=(1:length(unique(sample))), labels=unique(sample), tick=FALSE, hadj=1, las=2) - - abline(a=mean(Ti.Tv[which(novelty_name=="all" & filter_name=="called")]),b=0) - - legend("bottomright", legend=c("Known Variants", "Novel Variants", "Mean Ti/Tv for all variants"), col=c("blue", "red", "black"), pch=c(1,1,NA_integer_), lty=c(0, 0, 1), xjust=0.5) - mtext(line=9,"Lower Ti/Tv ratios indicate potentially increased false positive SNP rates.", side=1) - - - }else{ - print("TiTV filepath not provided") - } - - #Make DOC 
graph - if(is.na(DOCi)==FALSE){ - #pdf(paste(sample_set, "_DOCi.pdf", sep=""), width=6, height=5) - if(typeof(DOCi)=="character"){ - as.data.frame(read.delim(DOCi))->DOC - }else{ - DOCi->DOCdata - } - - colnames(DOC)->cols - apply(DOC[,grep("mean", cols)], 1, median)->medianofmeans - apply(DOC[,grep("mean", cols)], 1, quantile, probs=3/4)->q3s - apply(DOC[,grep("mean", cols)], 1, quantile, probs=1/4)->q1s - - par(ylog=FALSE, mar=c(5, 4, 4, 2)) - plot(c(1:3122),sort(medianofmeans, decreasing=TRUE), type="l", lwd="1",log="y",ylab="Coverage", xlab="Targets sorted by median average coverage across sample",xaxt="n", main="Coverage Across All Targets") - - abline(h=10, lty="dotted") - - lines(c(1:3122),q3s[order(medianofmeans, decreasing=TRUE)]) - - lines(c(1:3122),q1s[order(medianofmeans, decreasing=TRUE)]) - - legend("bottomleft", "10x coverage", box.lty=0, lty="dotted") - - - - #pdf(paste(sample_set, "_DOCiy.pdf", sep=""), width=6, height=5) - yuck<-DOC[which(medianofmeans<10),grep("mean", cols)] - yuck<-yuck+0.1 - par(mar=c(16, 4, 4, 2)) - boxplot(t(yuck[order(medianofmeans[which(medianofmeans<10)], decreasing=TRUE),]),log="y", yaxt="n", xaxt="n", ylab="Average coverage accross all samples", main="Targets with low coverage accross samples") - - axis(2, at=axTicks(2)+c(0, rep(0.1, length(axTicks(2))-1)), labels=c(0.0, axTicks(2)[2:length(axTicks(2))]), cex.axis=0.75) - mtext("Target", side=1, line=14) - axis(1, at=c(1:length(which(medianofmeans<10))), labels=DOC[which(medianofmeans<10),1][order(medianofmeans[which(medianofmeans<10)])], las=2, cex.axis=0.75) - - - - - }else{ - print("Depth of Coverage--intervals filepath not provided") - } - - if(is.na(DOCs)==FALSE){ - #pdf(paste(sample_set, "_DOCs.pdf", sep=""), width=6, height=5) - if(typeof(DOCs)=="character"){ - as.data.frame(read.delim(DOCs))->DOC2 - }else{ - DOCs->DOCdata - } - par(mar=c(10, 4, 4, 2)) - boxplot(t(DOC2[,2:ncol(DOC2)]+0.1), log="y", main="Depth of Coverage by Sample", xaxt="n", yaxt="n", 
ylab="Coverage") - - axis(1, at=c(1:nrow(DOC2)), labels=DOC2[,1], las=2) - - axis(2, at=axTicks(2)+c(0, rep(0.1, length(axTicks(2))-1)), labels=floor(c(0.0, axTicks(2)[2:length(axTicks(2))]))) - - labels=floor(c(0.0, axTicks(2)[2:length(axTicks(2))])) - - mtext("Samples", side=1, line=9) - - - - }else{ - print("Depth of Coverage--samples filepath not provided") - } - - dev.off() - -} -if(length(commandArgs(TRUE))>0){ - stuffmaker(commandArgs(TRUE)) - } diff --git a/R/DataProcessingReport/GetTearsheetStats.R b/R/DataProcessingReport/GetTearsheetStats.R deleted file mode 100644 index 032ff9f84..000000000 --- /dev/null @@ -1,366 +0,0 @@

# GetTearsheetStats.R
#
# Builds a one-page "tearsheet" PDF for a sequencing project. The script
#   1. reads the platform-unit (PU) tags from the @RG header lines of every
#      BAM in the BAM list,
#   2. downloads the per-lane and per-sample Picard metric tables from the
#      reporting database and keeps the rows belonging to those read groups,
#   3. renders summary tables plus a variants-by-allele-count plot.
#
# Outstanding layout TODOs (carried over from the original header):
#   - put titles/rownames left
#   - make titles blue
#   - decrease margins below titles
#   - put row names in black
#   - put background rows in
#   - change layouts so that it looks better
#   - get sample numbers in correctly
#   - the per-sample variant-count and functional-class plots are disabled
#     until a reader for those VariantEval tables exists

.libPaths('/humgen/gsa-firehose2/pipeline/repositories/StingProduction/R/')

suppressMessages(library(gplots))
suppressMessages(library(ReadImages))
suppressMessages(library(gsalib))
suppressMessages(library(ROracle))

# NOTE: the comma after `tearout` used to be commented out ("#,"), which made
# the whole list() call a parse error as soon as `plotout` followed it.
cmdargs <- gsa.getargs(
  list(
    yaml = list(value = NA, doc = "pipeline YAML file"),
    bamlist = list(value = NA, doc = "list of BAM files"),
    evalroot = list(value = NA, doc = "VariantEval file"),
    tearout = list(value = NA, doc = "Output path for tearsheet PDF"),
    plotout = list(value = NA, doc = "Output path for PDF")
  ),
  doc = "Creates a tearsheet"
)

# ---- helpers ---------------------------------------------------------------

# Return the PU ids found in the @RG header lines of one BAM file.
# Prints a notice and returns character(0) when samtools yields no header.
.platform_units <- function(bam) {
  # The path is shell-quoted because it comes from a user-supplied list file.
  header <- system(
    paste("samtools view -H", shQuote(path.expand(bam))),
    intern = TRUE
  )
  if (length(header) == 0) {
    print(sprintf("Could not load '%s'\n", bam))
    return(character(0))
  }
  fields <- unlist(strsplit(header[grep("^@RG", header)], "\t"))
  ids <- sub("PU:", "", grep("PU:", fields, value = TRUE))
  # Collapse the six characters following "XX" in the flowcell name.
  gsub("XX......", "XX", ids)
}

# Download a whole table from the reporting database, always releasing the
# result set.
.fetch_table <- function(con, table_name) {
  rs <- dbSendQuery(con, statement = paste("SELECT * FROM", table_name))
  on.exit(dbClearResult(rs), add = TRUE)
  fetch(rs, n = -1)
}

# Build the six "mean +/- sd" cells of the Bases Summary for one metrics
# table. `reads_col` names the read-count column, which differs between the
# per-lane and the per-sample table.
.bases_summary <- function(tbl, reads_col) {
  sci <- function(x) {
    format(x, trim = TRUE, digits = 3, nsmall = 1, scientific = TRUE)
  }
  sci_cell <- function(col) {
    v <- tbl[[col]]
    sprintf("%s +/- %s\n", sci(mean(v, na.rm = TRUE)), sci(sd(v, na.rm = TRUE)))
  }
  num_cell <- function(col, fmt) {
    v <- na.omit(tbl[[col]])
    sprintf(fmt, mean(v), sd(v))
  }
  pct_fmt <- "%0.2f%% +/- %0.2f%%\n"
  c(
    sci_cell(reads_col),
    sci_cell("PF HQ Aligned Q20 Bases"),
    num_cell("Mean Target Coverage", "%0.2fx +/- %0.2fx\n"),
    num_cell("Target Bases 10x %", pct_fmt),
    num_cell("Target Bases 20x %", pct_fmt),
    num_cell("Target Bases 30x %", pct_fmt)
  )
}

# Return `run_dates` sorted chronologically, in their original representation.
# The previous code parsed with "%d-%m-%Y" while the month-name substitutions
# were commented out, so every date parsed to NA and nothing was reordered.
# NOTE(review): "%d-%b-%y" is the format Tearsheet.R uses for this column;
# confirm it against the database output.
.order_run_dates <- function(run_dates) {
  if (inherits(run_dates, "Date") || inherits(run_dates, "POSIXt")) {
    parsed <- as.Date(run_dates)
  } else {
    parsed <- as.Date(as.character(run_dates), format = "%d-%b-%y")
    if (all(is.na(parsed))) {
      parsed <- as.Date(as.character(run_dates), format = "%d-%m-%Y")
    }
  }
  run_dates[order(parsed)]
}

# Rows of a VariantEval table for one novelty level ("all", "known", "novel").
.by_novelty <- function(tbl, level) {
  tbl[which(tbl[["Novelty"]] == level), , drop = FALSE]
}

# Draw the log-log "number of variants by allele count" plot on the current
# device. `cex` scales symbols, axis labels and tick labels alike.
.plot_variants_by_ac <- function(ac_table, main, cex) {
  ac_all <- .by_novelty(ac_table, "all")
  ac_known <- .by_novelty(ac_table, "known")
  ac_novel <- .by_novelty(ac_table, "novel")
  plot(ac_all$AC, ac_all$n, col = "black", type = "l", lwd = 2, cex = cex,
       cex.lab = cex, cex.axis = cex, xlab = "Allele count",
       ylab = "Number of variants", main = main, log = "xy", bty = "n")
  points(ac_known$AC, ac_known$n, col = "blue", type = "l", lwd = 2)
  points(ac_novel$AC, ac_novel$n, col = "red", type = "l", lwd = 2)
  legend("topright", c("All", "Known", "Novel"),
         col = c("black", "blue", "red"), lwd = 2)
}

# ---- collect read groups ---------------------------------------------------

bamlist <- scan(cmdargs$bamlist, "character")
squids <- system(
  paste("grep SQUID ", shQuote(path.expand(cmdargs$yaml)), ' |grep "C..." -o',
        sep = ""),
  intern = TRUE
)

pu_ids <- as.character(unlist(lapply(bamlist, .platform_units)))
pu_fields <- vapply(strsplit(pu_ids, "\\."), length, integer(1))

# flowcell.lane.barcode -> indexed; flowcell.lane -> non-indexed
indexed <- pu_ids[pu_fields == 3]
nonindexed <- pu_ids[pu_fields == 2]
for (id in pu_ids[!(pu_fields %in% c(2, 3))]) {
  # was `id + "..."`, which is an error in R ('+' is not string concatenation)
  print(paste0(id, " is a strange PU and will result in odd searches"))
}

# ---- pull metrics from the reporting database ------------------------------

# NOTE(review): credentials are hard-coded in the connection string; consider
# moving them to the environment.
drv <- dbDriver("Oracle")
con <- dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD")

d <- .fetch_table(con, "ILLUMINA_PICARD_METRICS")
d2 <- .fetch_table(con, "ILLUMINA_SAMPLE_STATUS_AGG")

dbDisconnect(con)
oraCloseDriver(drv)

squid_fclanes <- sprintf("%s.%s", d[["Flowcell"]], d[["Lane"]])
squid_fclanes_indexed <- sprintf("%s.%s.%s", d[["Flowcell"]], d[["Lane"]],
                                 d[["Barcode"]])

# Per-lane rows for our read groups, restricted to the projects in the YAML.
dproj <- rbind(
  d[which(squid_fclanes %in% nonindexed), ],
  d[which(squid_fclanes_indexed %in% indexed), ]
)
dproj <- dproj[which(dproj[["Project"]] %in% unique(squids)), ]

# Per-sample rows for the same projects and samples.
d2proj <- d2[which(d2[["Project"]] %in% unique(dproj[["Project"]]) &
                     d2[["Sample"]] %in% dproj[["External ID"]]), ]

# ---- tearsheet -------------------------------------------------------------

# Render the tearsheet PDF to `cmdargs$tearout`.
#
# Takes no arguments; reads the globals `cmdargs`, `bamlist`, `dproj` and
# `d2proj` built above. Panels, in drawing order: title bar, project summary,
# bases summary, sequencing summary, variant summary, variants by allele
# count. The PDF device is closed on exit even if a panel fails.
tearsheet <- function() {
  pdf(file = cmdargs$tearout, width = 22, height = 17, pagecentre = TRUE,
      pointsize = 24)
  on.exit(dev.off(), add = TRUE)

  # 15 x 6 grid: row 1 is the title bar, rows 2-14 hold the four tables,
  # row 15 holds the plots.
  postable <- matrix(
    c(rep(1, 6),
      rep(c(2, 2, 2, 4, 4, 4), 5),
      rep(c(3, 3, 3, 4, 4, 4), 3),
      rep(c(3, 3, 3, 5, 5, 5), 5),
      6, 6, 6, 7, 7, 7),
    nrow = 15, ncol = 6, byrow = TRUE
  )
  layout(postable, heights = c(1, rep(.18, 13), 2), respect = FALSE)

  # Title bar. The backdrop ships with gsalib; the old code passed a home
  # directory path ("~Documents/...") to system.file(), which returns "".
  drop <- read.jpeg(system.file("data", "tearsheetdrop.jpg", package = "gsalib"))
  par(mar = c(0, 0, 0, 0))
  plot(drop)
  text(155, 50, "testing", family = "serif", adj = c(0, 0), cex = 3,
       col = gray(.25))

  # ---- Project summary ----
  projects <- paste(unique(dproj[["Project"]]), collapse = ", ")
  used_samples <- length(bamlist)
  unused_samples <- 0
  sequencing_protocol <- "Hybrid selection"  # can this be extracted?

  # Bait sets, most frequent first; shortened when too long for the cell.
  bait_counts <- table(dproj[["Bait Set"]])
  bait_design <- paste(
    names(bait_counts)[order(bait_counts, decreasing = TRUE)],
    collapse = ", "
  )
  if (nchar(bait_design) > 50) {
    bait_design <- strsplit(bait_design, ", ")[[1]][1]
  }
  if (nchar(bait_design) > 50) {
    bait_design <- strsplit(bait_design, ".Homo")[[1]][1]
  }

  callable_target <- paste(na.omit(unique(dproj[["Target Territory"]])),
                           collapse = ", ")

  table1 <- rbind(
    paste(used_samples, " used samples/", unused_samples + used_samples,
          " total samples", sep = ""),
    sequencing_protocol,
    bait_design,
    callable_target
  )
  rownames(table1) <- c("Samples", "Sequencing Protocol", "Bait Design",
                        "Callable Target")
  par(mar = c(0, 0, 1, 0))
  textplot(table1, col.rownames = "darkblue", show.colnames = FALSE,
           cex = 1.25, valign = "top")
  title(main = sprintf("Project Summary (%s)\n", projects), family = "sans",
        cex.main = 1.25, line = -1)

  # ---- Bases summary ----
  lanes <- .bases_summary(dproj, "PF Reads (HS)")
  samps <- .bases_summary(d2proj, "PF Reads")

  table2 <- cbind(lanes, samps)
  colnames(table2) <- c("Per lane", "Per sample")
  rownames(table2) <- c("Reads", "Used bases", "Average target coverage",
                        "% loci covered to 10x", "% loci covered to 20x",
                        "% loci covered to 30x")
  par(mar = c(0, 0, 1, 0))
  textplot(table2, rmar = 1, col.rownames = "dark blue", cex = 1.25,
           valign = "top")
  title(main = "Bases Summary", family = "sans", cex.main = 1.25, line = 0)

  # ---- Sequencing summary ----
  # Instrument is inferred from the flowcell name suffix.
  instrument <- c()
  if (length(grep("AAXX", dproj[["Flowcell"]])) > 0) {
    instrument <- c(instrument, "Illumina GA2")
  }
  if (length(grep("ABXX", dproj[["Flowcell"]])) > 0) {
    instrument <- c(instrument, "Illumina HiSeq")
  }
  # An empty vector would silently drop a row from table3 and break the
  # rownames assignment below, so fall back to a placeholder.
  instrument <- if (length(instrument) > 0) {
    paste(instrument, collapse = " and ")
  } else {
    "Unknown"
  }

  used_lanes <- nrow(dproj)
  unused_lanes_by_sequencing <- 0  # can we get this?
  unused_lanes_by_analysis <- 0

  lanes_per_sample <- table(dproj[["External ID"]])
  lanes_per_sample_mean <- mean(lanes_per_sample, na.rm = TRUE)
  lanes_per_sample_sd <- sd(lanes_per_sample, na.rm = TRUE)
  lanes_per_sample_median <- median(lanes_per_sample)

  lane_type <- dproj[["Lane Type"]]
  lanes_paired <- length(which(lane_type == "Paired"))
  lanes_widowed <- length(which(lane_type == "Widowed"))
  lanes_single <- length(which(lane_type == "Single"))

  read_length <- dproj[["Mean Read Length (P)"]]
  read_length_mean <- mean(read_length, na.rm = TRUE)
  read_length_sd <- sd(read_length, na.rm = TRUE)
  read_length_median <- median(read_length, na.rm = TRUE)

  run_dates <- .order_run_dates(dproj[["Run Date"]])
  start_date <- run_dates[1]
  end_date <- run_dates[length(run_dates)]

  table3 <- rbind(
    instrument,
    used_lanes,
    sprintf("%s rejected by sequencing, %s by analysis\n",
            unused_lanes_by_sequencing, unused_lanes_by_analysis),
    sprintf("%0.1f +/- %0.1f lanes (median=%0.1f)\n", lanes_per_sample_mean,
            lanes_per_sample_sd, lanes_per_sample_median),
    sprintf("%s paired, %s widowed, %s single\n", lanes_paired, lanes_widowed,
            lanes_single),
    sprintf("%0.1f +/- %0.1f bases (median=%0.1f)\n", read_length_mean,
            read_length_sd, read_length_median),
    sprintf("\tSequencing dates: %s to %s\n", start_date, end_date)
  )
  rownames(table3) <- c("Sequencer", "Used lanes", "Unused lanes",
                        "Used lanes/sample", "Lane parities", "Read lengths",
                        "Sequencing dates")
  par(mar = c(0, 0, 1, 0))
  textplot(table3, rmar = 1, col.rownames = "dark blue", show.colnames = FALSE,
           cex = 1.25, valign = "top")
  title(main = "Sequencing Summary", family = "sans", cex.main = 1.25, line = 0)

  # ---- Variant summary ----
  # Named `report` rather than `eval` to avoid masking base::eval.
  report <- gsa.read.gatkreport(cmdargs$evalroot)
  counts <- report$CountVariants
  titv <- report$TiTvVariantEvaluator

  # Filled column-wise: found counts, observed Ti/Tv, expected Ti/Tv.
  table4 <- matrix(
    c(.by_novelty(counts, "all")$nVariantLoci,
      .by_novelty(counts, "known")$nVariantLoci,
      .by_novelty(counts, "novel")$nVariantLoci,
      .by_novelty(titv, "all")$tiTvRatio,
      .by_novelty(titv, "known")$tiTvRatio,
      .by_novelty(titv, "novel")$tiTvRatio,
      "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"),
    nrow = 3
  )
  rownames(table4) <- c("All", "Known", "Novel")
  colnames(table4) <- c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio")

  par(mar = c(0, 0, 0, 0))
  textplot(table4, rmar = 1, col.rownames = "dark blue", cex = 1.25,
           valign = "top")
  title(main = "Variant Summary", family = "sans", cex.main = 1.25, line = -2)

  # ---- Plots ----
  par(mar = c(5, 4, 4, 2) + 0.1)
  .plot_variants_by_ac(report$SimpleMetricsByAC.metrics,
                       main = "Variants by Allele Count", cex = 1.1)

  invisible(NULL)
}

tearsheet()

# Render the stand-alone variants-by-allele-count plot to `cmdargs$plotout`.
#
# Previously this read `eval$...` (which is base::eval here, because the
# report was local to tearsheet()) and the undefined `eval.ac.called`, so it
# could never run. It now loads the VariantEval report itself.
plots <- function() {
  report <- gsa.read.gatkreport(cmdargs$evalroot)

  pdf(file = cmdargs$plotout, width = 22, height = 17, pagecentre = TRUE,
      pointsize = 24)
  on.exit(dev.off(), add = TRUE)

  .plot_variants_by_ac(report$SimpleMetricsByAC.metrics, main = "", cex = 1.3)

  invisible(NULL)
}

diff --git a/R/DataProcessingReport/Tearsheet.R b/R/DataProcessingReport/Tearsheet.R deleted file mode 100644 index 10cf434f3..000000000 --- /dev/null @@ -1,266 +0,0 @@ -#New tearsheet generator
.libPaths('/humgen/gsa-pipeline/.repository/R/')

suppressMessages(library(gplots))
suppressMessages(library(ReadImages))
suppressMessages(library(gsalib))

# ---- helpers ---------------------------------------------------------------

# Build the six "mean +/- sd" cells of the Bases Summary for one metrics
# table. `reads_col` names the read-count column, which differs between the
# per-lane and the per-sample table.
.bases_summary <- function(tbl, reads_col) {
  sci <- function(x) {
    format(x, trim = TRUE, digits = 3, nsmall = 1, scientific = TRUE)
  }
  sci_cell <- function(col) {
    v <- tbl[[col]]
    sprintf("%s +/- %s\n", sci(mean(v, na.rm = TRUE)), sci(sd(v, na.rm = TRUE)))
  }
  num_cell <- function(col, fmt) {
    v <- na.omit(tbl[[col]])
    sprintf(fmt, mean(v), sd(v))
  }
  pct_fmt <- "%0.2f%% +/- %0.2f%%\n"
  c(
    sci_cell(reads_col),
    sci_cell("PF HQ Aligned Q20 Bases"),
    num_cell("Mean Target Coverage", "%0.2fx +/- %0.2fx\n"),
    num_cell("Target Bases 10x %", pct_fmt),
    num_cell("Target Bases 20x %", pct_fmt),
    num_cell("Target Bases 30x %", pct_fmt)
  )
}

# Keep only the "overall" stratum of a VariantEval table: for each of the
# stratification columns the table actually has, keep the aggregate rows
# (FunctionalClass == "all", Sample == "all", Filter == "called").
.keep_overall <- function(tbl) {
  wanted <- c(FunctionalClass = "all", Sample = "all", Filter = "called")
  for (col in names(wanted)) {
    if (col %in% colnames(tbl)) {
      tbl <- tbl[which(tbl[[col]] == wanted[[col]]), , drop = FALSE]
    }
  }
  tbl
}

# Rows of a VariantEval table for one novelty level ("all", "known", "novel").
.by_novelty <- function(tbl, level) {
  tbl[which(tbl[["Novelty"]] == level), , drop = FALSE]
}

# Per-sample rows (the "all" aggregate sample excluded) for one novelty level.
.per_sample <- function(tbl, level) {
  tbl[which(tbl[["Novelty"]] == level & tbl[["Sample"]] != "all"), ,
      drop = FALSE]
}

# ---- tearsheet -------------------------------------------------------------

# Draw the tearsheet page on the currently open graphics device.
#
# Takes no arguments. It reads these objects from the calling environment,
# none of which is defined in this file (NOTE(review): confirm with the
# driver script): `cmdargs` (uses `$title`), `squids`, `settable`, `samp`
# (per-sample metrics), `lane` (per-lane / per-readgroup metrics),
# `basiceval`, `SAeval` and `FCeval` (VariantEval reports).
#
# Panels, in drawing order: title bar, project summary, bases summary,
# sequencing summary, variant summary, Ti/Tv by sample, variants per sample,
# variants by allele count, variants by functional class. The caller's
# graphical parameters are restored on exit, even when a panel fails.
tearsheet <- function() {
  def.par <- par(no.readonly = TRUE)
  on.exit(par(def.par), add = TRUE)

  # 15 x 4 grid: row 1 is the title bar, rows 2-14 hold the four tables,
  # row 15 holds the four plots.
  postable <- matrix(
    c(rep(1, 4),
      rep(c(2, 2, 4, 4), 5),
      rep(c(3, 3, 4, 4), 3),
      rep(c(3, 3, 5, 5), 5),
      6, 7, 8, 9),
    nrow = 15, ncol = 4, byrow = TRUE
  )
  layout(postable, heights = c(1, rep(.18, 13), 2), respect = FALSE)

  # ---- Title bar ----
  drop <- read.jpeg(system.file("data", "tearsheetdrop.jpg", package = "gsalib"))
  par(mar = c(0, 0, 0, 0))
  plot(drop)
  text(155, 50, cmdargs$title, family = "serif", adj = c(0, 0), cex = 3,
       col = gray(.25))
  print("Title created...")

  # ---- Project summary ----
  projects <- paste(squids, collapse = ", ")
  used_samples <- nrow(settable)
  unused_samples <- 0
  sequencing_protocol <- samp$Initiative[1]
  bait_design <- samp[["Bait Set"]][1]
  callable_target <- samp[["Target Territory"]][1]

  table1 <- rbind(
    paste(used_samples, " used samples/", unused_samples + used_samples,
          " total samples", sep = ""),
    sequencing_protocol,
    bait_design,
    callable_target
  )
  rownames(table1) <- c("Samples", "Sequencing Initiative", "Bait Design",
                        "Callable Target")
  par(mar = c(0, 0, 1, 0))
  textplot(table1, col.rownames = "darkblue", show.colnames = FALSE,
           cex = 1.25, valign = "top")
  title(main = sprintf("Project Summary (%s)\n", projects), family = "sans",
        cex.main = 1.25, line = -1)
  print("Project summary created...")

  # ---- Bases summary ----
  lanessum <- .bases_summary(lane, "PF Reads (HS)")
  sampssum <- .bases_summary(samp, "PF Reads")
  table2 <- cbind(lanessum, sampssum)

  # More rows than distinct flowcell/lane pairs means the lanes were
  # multiplexed, i.e. each row is a barcoded read group rather than a lane.
  used_lanes <- length(unique(paste(lane$Flowcell, lane$Lane)))
  multiplexed <- nrow(lane) > used_lanes
  colnames(table2) <- if (multiplexed) {
    c("Per barcoded readgroup", "Per sample")
  } else {
    c("Per lane", "Per sample")
  }
  rownames(table2) <- c("Reads", "Used bases", "Average target coverage",
                        "% loci covered to 10x", "% loci covered to 20x",
                        "% loci covered to 30x")
  par(mar = c(0, 0, 1, 0))
  textplot(table2, rmar = 1, col.rownames = "dark blue", cex = 1.25,
           valign = "top")
  title(main = "Bases Summary", family = "sans", cex.main = 1.25, line = 0)
  print("Bases summary created...")

  # ---- Sequencing summary ----
  # Instrument is inferred from the flowcell name suffix.
  instrument <- c()
  if (length(grep("AAXX", lane$Flowcell)) > 0) {
    instrument <- c(instrument, "Illumina GA2")
  }
  if (length(grep("ABXX", lane$Flowcell)) > 0) {
    instrument <- c(instrument, "Illumina HiSeq")
  }
  # An empty vector would silently drop a row from table3 and break the
  # rownames assignment below, so fall back to a placeholder.
  instrument <- if (length(instrument) > 0) {
    paste(instrument, collapse = " and ")
  } else {
    "Unknown"
  }

  unused_lanes_by_sequencing <- 0  # can we get this?
  unused_lanes_by_analysis <- 0

  lanes_per_sample <- table(lane[["External ID"]])
  lanes_per_sample_mean <- mean(lanes_per_sample, na.rm = TRUE)
  lanes_per_sample_sd <- sd(lanes_per_sample, na.rm = TRUE)
  lanes_per_sample_median <- median(lanes_per_sample)

  # Distinct flowcell/lane pairs of a given lane type.
  count_lanes <- function(type) {
    sel <- lane[which(lane[["Lane Type"]] == type), , drop = FALSE]
    length(unique(paste(sel$Flowcell, sel$Lane)))
  }
  lanes_paired <- count_lanes("Paired")
  lanes_widowed <- count_lanes("Widowed")
  lanes_single <- count_lanes("Single")

  read_length <- lane[["Mean Read Length (P)"]]
  read_length_mean <- mean(read_length, na.rm = TRUE)
  read_length_sd <- sd(read_length, na.rm = TRUE)
  read_length_median <- median(read_length, na.rm = TRUE)

  run_dates <- sort(as.Date(lane[["Run Date"]], format = "%d-%b-%y"))
  start_date <- format(run_dates[1], "%B %d, %Y")
  end_date <- format(run_dates[length(run_dates)], "%B %d, %Y")

  if (multiplexed) {
    used_lanes <- paste(used_lanes, " (multiplexed; ", nrow(lane),
                        " total barcoded readgroups)", sep = "")
  }
  table3 <- rbind(
    instrument,
    used_lanes,
    sprintf("%s rejected by sequencing, %s by analysis\n",
            unused_lanes_by_sequencing, unused_lanes_by_analysis),
    sprintf("%0.1f +/- %0.1f lanes (median=%0.1f)\n", lanes_per_sample_mean,
            lanes_per_sample_sd, lanes_per_sample_median),
    sprintf("%s paired, %s widowed, %s single\n", lanes_paired, lanes_widowed,
            lanes_single),
    sprintf("%0.1f +/- %0.1f bases (median=%0.1f)\n", read_length_mean,
            read_length_sd, read_length_median),
    sprintf("\tSequencing dates: %s to %s\n", start_date, end_date)
  )
  rownames(table3) <- c("Sequencer", "Used lanes", "Unused lanes",
                        "Used lanes/sample", "Lane parities", "Read lengths",
                        "Sequencing dates")
  par(mar = c(0, 0, 1, 0))
  textplot(table3, rmar = 1, col.rownames = "dark blue", show.colnames = FALSE,
           cex = 1.25, valign = "top")
  title(main = "Sequencing Summary", family = "sans", cex.main = 1.25, line = 0)
  print("Sequencing summary created...")

  # ---- Variant summary ----
  eval.counts <- .keep_overall(basiceval$CountVariants)
  eval.titv <- .keep_overall(basiceval$TiTvVariantEvaluator)

  # Filled column-wise: found counts, observed Ti/Tv, expected Ti/Tv.
  table4 <- matrix(
    c(.by_novelty(eval.counts, "all")$nVariantLoci,
      .by_novelty(eval.counts, "known")$nVariantLoci,
      .by_novelty(eval.counts, "novel")$nVariantLoci,
      .by_novelty(eval.titv, "all")$tiTvRatio,
      .by_novelty(eval.titv, "known")$tiTvRatio,
      .by_novelty(eval.titv, "novel")$tiTvRatio,
      "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"),
    nrow = 3
  )
  rownames(table4) <- c("All", "Known", "Novel")
  colnames(table4) <- c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio")
  print("Variant summary created...")

  par(mar = c(0, 0, 0, 0))
  textplot(table4, rmar = 1, col.rownames = "dark blue", cex = 1.25,
           valign = "top")
  title(main = "Variant Summary", family = "sans", cex.main = 1.25, line = -2)

  # ---- Data for the plots ----
  eval.bysample <- SAeval$CountVariants
  eval.bysample.all <- .per_sample(eval.bysample, "all")
  eval.bysample.known <- .per_sample(eval.bysample, "known")
  eval.bysample.novel <- .per_sample(eval.bysample, "novel")

  eval.bysampleTITV <- SAeval$TiTvVariantEvaluator
  eval.bysampleTITV.all <- .per_sample(eval.bysampleTITV, "all")
  eval.bysampleTITV.known <- .per_sample(eval.bysampleTITV, "known")
  eval.bysampleTITV.novel <- .per_sample(eval.bysampleTITV, "novel")

  # The stratification columns are now looked up on the allele-count table
  # itself; the old code tested colnames(eval.titv) here.
  eval.ac <- .keep_overall(basiceval$SimpleMetricsByAC.metrics)
  eval.ac.all <- .by_novelty(eval.ac, "all")
  eval.ac.known <- .by_novelty(eval.ac, "known")
  eval.ac.novel <- .by_novelty(eval.ac, "novel")

  eval.func <- FCeval$CountVariants

  # ---- Ti/Tv by sample ----
  # `main` was passed twice ("Ti/Tv by Sample" and ""), which R rejects with
  # "formal argument matched by multiple actual arguments".
  par(mar = c(5, 5, 4, 2) + 0.1)
  boxplot(eval.bysampleTITV.all$tiTvRatio, eval.bysampleTITV.known$tiTvRatio,
          eval.bysampleTITV.novel$tiTvRatio, main = "Ti/Tv by Sample",
          col = c("dark gray", "blue", "red"),
          names = c("All", "Known", "Novel"), ylab = "Ti/Tv per sample",
          cex = 1.3, cex.lab = 1.3, cex.axis = 1.3)

  # ---- Variants per sample ----
  # Samples are ordered by their total count; the known/novel series reuse
  # that ordering, which assumes all three subsets list samples identically.
  par(mar = c(7, 5, 4, 2) + 0.1)
  ind <- order(eval.bysample.all$nVariantLoci)
  plot(eval.bysample.all$nVariantLoci[ind], xlab = "", pch = 16, col = "black",
       xaxt = "n", cex = 1.1, cex.lab = 1.1, cex.axis = 1.1,
       main = "Variants per Sample",
       ylab = "Number of variants\n(axis in log space)", bty = "n", log = "y",
       ylim = c(1, max(eval.bysample.all$nVariantLoci)))
  points(eval.bysample.known$nVariantLoci[ind], pch = 16, col = "blue",
         cex = 1.3)
  points(eval.bysample.novel$nVariantLoci[ind], pch = 16, col = "red",
         cex = 1.3)
  # A keyword position needs no `y`; the stray numeric and the empty argument
  # of the old call are gone.
  legend("bottomleft", legend = c("All", "Known", "Novel"),
         col = c("black", "blue", "red"), pt.cex = 1.3, pch = 16)
  if (nrow(samp) < 25) {
    sample_names <- eval.bysample.all$Sample[ind]
    axis(1, at = seq_along(sample_names), labels = sample_names, cex = .7,
         las = 2)
  } else {
    axis(1, at = seq_len(nrow(samp)), labels = rep("", nrow(samp)), cex = 0.1,
         las = 2, lwd.ticks = 0)
    title(xlab = "Sample\n(too many individuals to label)")
  }

  # ---- Variants by allele count ----
  par(mar = c(6, 5, 4, 2) + 0.1)
  plot(sort(eval.ac.all$AC), eval.ac.all$n[order(eval.ac.all$AC)],
       ylim = c(1, max(eval.ac$n)), col = "black", type = "l", lwd = 2,
       cex = 1.1, cex.lab = 1.1, cex.axis = 1.1,
       xlab = "Allele count\n(axis in log space)",
       ylab = "Number of variants\n(axis in log space)",
       main = "Variants by Allele Count", log = "xy", bty = "n")
  points(sort(eval.ac.known$AC), eval.ac.known$n[order(eval.ac.known$AC)],
         col = "blue", type = "l", lwd = 2)
  points(sort(eval.ac.novel$AC), eval.ac.novel$n[order(eval.ac.novel$AC)],
         col = "red", type = "l", lwd = 2)
  # Legend corner depends on cohort size.
  legend(if (nrow(samp) < 25) "bottomleft" else "topright",
         c("All", "Known", "Novel"), col = c("black", "blue", "red"), lwd = 2)

  # ---- Variants by functional class ----
  # Rows 1-3 of the table are skipped; the remaining rows are drawn as
  # all/known/novel triplets for the three classes labelled on the axis.
  # NOTE(review): this relies on the row order of FCeval$CountVariants.
  par(mar = c(5, 5, 4, 2) + 0.1)
  barplot(eval.func$nVariantLoci[4:nrow(eval.func)],
          col = c("dark gray", "blue", "red"), space = c(.2, 0, 0), log = "y",
          main = "Variants by Functional Class", xlab = "Functional Class",
          ylab = "Number of variants\n(axis in log space)")
  axis(1, at = c(1.5, 5, 8.5), labels = c("Missense", "Nonsense", "Silent"),
       cex = .5, tick = FALSE)
  legend("top", c("All", "Known", "Novel"),
         fill = c("dark gray", "blue", "red"), cex = .7)

  print("Graphs created...")

  print("All done!")
  invisible(NULL)
}

diff --git a/R/DataProcessingReport/newpostqc.r b/R/DataProcessingReport/newpostqc.r deleted file mode 100644 index 6544aa06f..000000000 --- /dev/null @@ -1,41 +0,0 @@ -source("/humgen/gsa-pipeline/.repository/R/DataProcessingReport/qcplots.r") -suppressMessages(library(gplots)); -def.par <- par(no.readonly = TRUE) - - -cmdargs = gsa.getargs( - list( - tsv = list(value=NA, doc="pipeline tsv file"), - evalroot = list(value=NA, doc="VariantEval file base (everything before the .eval)"), - reportout = list(value=NA, doc="Output path for report PDF")#, - ), 
- doc="Creates a variant report" -); - -read.delim(cmdargs$tsv, header=FALSE)->settable - -squids<-unique(settable[,1]) - - -gsa.read.gatkreport(paste(cmdargs$evalroot, ".eval", sep=""))->basiceval -gsa.read.gatkreport(paste(cmdargs$evalroot, ".extraSA.eval", sep=""))->SAeval -print("Evals read") - -pdf(file= cmdargs$reportout, width=22, height=17, pagecentre=TRUE, pointsize=24) - print("PDF created...") - - -path="." -weirdos<-which(SAeval$TiTvVariantEvaluator$Sample %in% SAeval$TiTvVariantEvaluator$Sample[which(SAeval$TiTvVariantEvaluator$tiTvRatio <2)]) - -novelAC(SAeval) -knownAC(SAeval) -AllAC(SAeval) -layout(matrix(c(6,1, 2,3, 4, 5), nrow=6), heights=c(1, 1, 1, 1, 1,1)) -textplot("Sample Novel TiTv ranges should be above 2, as they are in previous datasets. \nSamples with lower TiTv data are flagged in subsequent plots with hot pink labels, and listed below:") -textplot(paste(unique(SAeval$TiTvVariantEvaluator$Sample[weirdos]), collapse=", "), halign="left") -textplot("Problem Samples frequently have unusually high or low numbers of variants.") -textplot("Samples with unusually high numbers of novel variants may be from different populations, and, as such, should have higher heterozygosity. 
\nIf this is not the case, there may be problems with the samples.") -textplot("Unusually high numbers of variants with low allele counts may indicate variants generated from problematic samples.") -textplot("Notes for interpreting QC data:") -dev.off() diff --git a/R/DataProcessingReport/prelimqc.r b/R/DataProcessingReport/prelimqc.r deleted file mode 100644 index 5c3fc2ecb..000000000 --- a/R/DataProcessingReport/prelimqc.r +++ /dev/null @@ -1,177 +0,0 @@ -#preqc.r -library(gplots) -.libPaths('/humgen/gsa-pipeline/.repository/R/') -library(gsalib) - -cmdargs = gsa.getargs( - list( - tsv = list(value=NA, doc="pipeline tsv file"), - qcout=list(value=NA, doc="path to output root") - ), - doc="Creates a tearsheet" -); - -read.delim(cmdargs$tsv, header=FALSE)->settable - - squids<-unique(settable[,1]) -print(paste(nrow(settable), "samples in tsv")) -lane<-data.frame() -samp<-data.frame() -for(squid in squids){ - gsa.read.squidmetrics(squid, TRUE)->lanemetrics - print(paste("Got lane metrics for", squid)) - addlanes<-lanemetrics[which(lanemetrics$"External ID" %in% settable[,2]),] - gsa.read.squidmetrics(squid, FALSE)->samplemetrics - print(paste("Got sample metrics for", squid)) - addsamps<-samplemetrics[which(samplemetrics$Sample %in% settable[,2]),] - lane<-rbind(lane, addlanes) - samp<-rbind(samp, addsamps) -} - -print(paste(nrow(samp), "samples in samp")) -print(paste(length(unique(lane$"External ID")), "samples in lane")) - -print(paste(setdiff(settable[,2], samp$Sample), "do not overlap between samp and tsv")) -print(paste(setdiff(settable[,2], lane$"External ID"), "do not overlap between lane and tsv")) -print(paste(setdiff(samp$Sample, lane$"External ID"), "do not overlap between lane and samp")) - -missingSamp<-setdiff(settable[,2], samp$Sample) -missingLane<-setdiff(settable[,2], lane$"External ID") - -drv = dbDriver("Oracle"); -con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD"); - -rs = dbSendQuery(con, statement = paste("SELECT * FROM 
ILLUMINA_PICARD_METRICS")); -d = fetch(rs, n=-1); -dbHasCompleted(rs); -dbClearResult(rs); - -rs2 = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG")); -d2 = fetch(rs2, n=-1); -dbHasCompleted(rs2); -dbClearResult(rs2); - -oraCloseDriver(drv); - - compsamp=d2[which(d2$"Bait Set" %in% samp$"Bait Set"),] - complane=d[which(d$"Bait Set" %in% lane$"Bait Set"),] - - - -pdf(paste(cmdargs$qcout, "pdf", sep="."), width=11, height=8.5) - -plot(samp$"Target Bases 20x %", main="Coverage to 20x", ylab="% Targets Covered to 20x", xlab="Sample", ylim=c(0,100)) -abline(h=80, lty=2) -legend("bottomright", lty=2, legend="80% coverage to 20x") -lowcoverage<-samp$Sample[which(samp$"Target Bases 20x %"<80)] -if(length(lowcoverage)>0){ -text(which(samp$"Target Bases 20x %"<80),samp$"Target Bases 20x %"[which(samp$"Target Bases 20x %"<80)], labels=samp$Sample[which(samp$"Target Bases 20x %"<80)], pos=2, srt=270, cex=.6, col="hotpink") -} - -plot(samp$"Zero Coverage Targets %", main="Zero Coverage", ylab="% Targets with zero coverage", log="y", xlab="Sample", ylim=c(0.01,100)) -abline(h=3, lty=2) -legend("bottomright", lty=2, legend="3% Targets Zero Coverage") -lowcoverage<-c(lowcoverage,samp$Sample[which(samp$"Zero Coverage">3)]) -if(length(which(samp$"Zero Coverage Targets %">3))>0){ -text(which(samp$"Zero Coverage Targets %">3), samp$"Zero Coverage Targets %"[which(samp$"Zero Coverage Targets %">3)], labels=samp$Sample[which(samp$"Zero Coverage Targets %">3)], pos=2, srt=270, cex=.6, col="hotpink") -} - -print("Coverage stats done") -nofp<-lane$"External ID"[which(is.na(lane$"FP LOD"))] - -if(length(which(is.na(lane$"FP LOD")))< nrow(lane)){ - -plot(lane$"FP Confident Calls"~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level # FP calls", main="Fingerprint Calls/Sample Instance", xaxt="n") -medians<-tapply(lane$"FP Confident Calls",lane$"External ID", median, na.rm=TRUE) -points(as.factor(dimnames(medians)[[1]]),medians,col="red", lwd=2) 
-legend("topleft", legend="Median across sample instances", pch=1, lwd=2, col="red", lty=0) -poorFPcov<-dimnames(medians)[[1]][which(medians<5 )] -if(length(poorFPcov)>0){ -text(which(medians<5), medians[which(medians<5)],poorFPcov, pos=2, srt=270, cex=.6, col="hotpink") -} - -print("1 fp plot") -plot(100*(lane$"FP Confident Matching SNPs"/lane$"FP Confident Calls")~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level % matching FP calls", main="% Confident calls matching for samples with low confident calls", xaxt="n", ylim=c(0,110)) - -print("2 fp plot") - -plot(lane$"FP LOD"~as.factor(lane$"External ID"), xlab="sample", ylab="Sample Fingerprint LOD", main="Fingerprint Pass:Samples", xaxt="n") -offsamps<-lane$"External ID"[which(lane$"FP LOD"<(-3))] -lowfpLOD<-lane$"External ID"[which(lane$"FP LOD"<6)] - -if(length(lowfpLOD)>0){ -text(which(lane$"External ID" %in% lowfpLOD), lane$"FP_LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="hotpink") -} -print("3 fp plot") - -if(length(lowfpLOD)>0){ -plot((lane$"FP Confident Calls"-lane$"FP Confident Matching SNPs")~as.factor(lane$"External ID"), main="Calls vs Matching Calls for Samples failing FP QC", ylab="# Mismatches", xlab="") -} -if(length(lowfpLOD)>0){ -text(which(lane$"FP LOD"<6), lane$"FP_LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="RED") -} - - -}else{ -offsamps<-"NO FPDATA" -lowfpLOD<-"NO FP DATA" -poorFPcov<-"NO FP DATA" -} -print("FP stats done") - -boxplot(samp$"Total SNPs", compsamp$"Total SNPs", names=c("Current Set", "All Sets"), ylab="Total SNPs per sample", main="Total SNPs") -standardQuants<-boxplot.stats(compsamp$"Total SNPs")$stats -offSNPs<-samp$Sample[which(samp$"Total SNPs" standardQuants[5])]) -if(length(offSNPs >0)){ - text(1, samp$"Total SNPs"[which(samp$Sample %in% offSNPs)], labels=offSNPs, pos=2, col="hot pink") -} -print("SNP stats done") - -boxplot(samp$"dbSNP %", compsamp$"dbSNP %", names=c("Current Set", "All 
Sets"), ylab="% SNPs in dbSNP per sample", main="dbSNP Percentage") -standardQuants<-boxplot.stats(compsamp$"dbSNP %")$stats -offdbSNP<-samp$Sample[which(samp$"dbSNP %" standardQuants[5])]) -if(length(offdbSNP >0)){ - text(1, samp$"dbSNP %"[which(samp$Sample %in% offdbSNP)], labels=offdbSNP, pos=2, col="hot pink") -} -print("DBSNP stats done") - -sampDuplication<-sub(pattern="Catch-.*: ", "",samp$"Library Duplication %") -sampDuplication<-as.numeric(sub("%", "", sampDuplication)) -compsampDuplication<-sub(pattern="Catch-.*: ", "",compsamp$"Library Duplication %") -compsampDuplication<-as.numeric(sub("%", "", compsampDuplication)) - -boxplot(sampDuplication, compsampDuplication, names=c("Current Set", "All Sets"), ylab="% Duplication", main="Library Duplication") -standardQuants<-boxplot.stats(compsampDuplication)$stats -offDup<-samp$Sample[which(sampDuplication standardQuants[5])]) -if(length(offDup >0)){ - text(1, sampDuplication[which(samp$Sample %in% offDup)], labels=offDup, pos=2, col="hot pink") -} -print("Duplication stats done") - -allproblemsamples<-unique(c(lowcoverage, poorFPcov, offsamps, lowfpLOD, offSNPs, offdbSNP, offDup, missingLane, missingSamp)) -problemMat<-matrix(c(rep("PASS", length(allproblemsamples)*9)), nrow=length(allproblemsamples)) -rownames(problemMat)<-allproblemsamples -colnames(problemMat)<-c("low coverage", "low fp cov", "Identity Fail", "low FP LOD", "weird SNP count", "weird dbSNP %", "Duplicated", "Missing lane data", "missing agg data") -problemMat[which(rownames(problemMat) %in% lowcoverage),1]<-"FAIL" -problemMat[which(rownames(problemMat) %in% poorFPcov),2]<-"FAIL" -problemMat[which(rownames(problemMat) %in% offsamps),2]<-"FAIL" -problemMat[which(rownames(problemMat) %in% lowfpLOD),4]<-"FAIL" -problemMat[which(rownames(problemMat) %in% offSNPs),5]<-"FAIL" -problemMat[which(rownames(problemMat) %in% offdbSNP),6]<-"FAIL" -problemMat[which(rownames(problemMat) %in% offDup),7]<-"FAIL" -problemMat[which(rownames(problemMat) %in% 
missingLane),8]<-"FAIL" -problemMat[which(rownames(problemMat) %in% missingSamp),9]<-"FAIL" - -textplot(problemMat, cex=.5) - -write.table(problemMat, file=paste(cmdargs$qcout,"qc.table",sep="."), quote=FALSE, sep="\t") -print("no fp") -print(unique(nofp)) - - - -dev.off() -print("All stats done") diff --git a/R/DataProcessingReport/qcplots.r b/R/DataProcessingReport/qcplots.r deleted file mode 100644 index 80da2fd83..000000000 --- a/R/DataProcessingReport/qcplots.r +++ /dev/null @@ -1,181 +0,0 @@ -.libPaths('/humgen/gsa-firehose2/pipeline/repositories/StingProduction/R/') -.libPaths('~/Documents/Sting/R/') - -library(gsalib) -def.par <- par(no.readonly = TRUE) - -titvplot<-function(current){ -par(mfcol=c(1,2)) -titvs<-c() -status<-c() -for(i in c(1:12)){ - load(sprintf("%sexome.%i", path, i)); - info<-subset(data$TiTvVariantEvaluator, Sample!="all") - titvs<-c(titvs, info$tiTvRatio) - status<-c(status, info$Novelty) - print(length(titvs)) - print(length(status)) - } -print(length(unique(current$TiTvVariantEvaluator$Sample))-1) - -length(unique(current$TiTvVariantEvaluator$Sample))-1+length(titvs[which(status=="novel")])->nvalues -print(length(titvs[which(status=="novel")])) -print(nvalues) -plot(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="novel")], xlim=c(0,nvalues), ylim=c(0,4), col="red", main="Current samples compared to previous samples from 12 sets", ylab="Per sample Ti/Tv", xlab="sample") -points(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="known")], col="blue") -points(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="all")], col="black") -points(c(length(unique(current$TiTvVariantEvaluator$Sample)):nvalues), titvs[which(status=="novel")], pch=16, col="red") 
-points(c(length(unique(current$TiTvVariantEvaluator$Sample)):nvalues), titvs[which(status=="known")], pch=16, col="blue") -points(c(length(unique(current$TiTvVariantEvaluator$Sample)):nvalues), titvs[which(status=="all")], pch=16, col="black") - -legend("bottomleft", col=c("red", "blue", "black"), pch=c(1,1,1,16,16, 16),legend=c("novel variants:current set", "known variants:current set", "all varaints:current set", "novel variants:previous sets", "known variants:previous sets", "all variants: previous sets")) -weirdos<-which(current$TiTvVariantEvaluator$Sample %in% current$TiTvVariantEvaluator$Sample[which(current$TiTvVariantEvaluator$tiTvRatio <2.0)]) -if(length(weirdos)>0){ - text(weirdos[c(1:(length(weirdos)/3))],current$TiTvVariantEvaluator$tiTvRatio[weirdos], labels=current$TiTvVariantEvaluator$Sample[weirdos], pos=4, cex=.7, col="hot pink") -} - -boxplot(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="novel")],titvs[which(status=="novel")], current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="known")],titvs[which(status=="known")], current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="all")], titvs[which(status=="all")], col=rep(c("red", "blue", "black"), each=2), main="Current v. 
Previous per sample Ti/TV", xlab="Sample Sets",ylab="Ti/Tv per sample", xaxt="n" ) -axis(side=1, at=c(1:6)-.2, labels=rep(c("current", "previous"), 3), cex.axis=.7) -legend("bottomleft",legend=c("novel", "known", "all"), fill=c("red", "blue", "black")) -if(length(weirdos)>0){ -text(rep(c(5,3,1), each=(length(weirdos)/3)),current$TiTvVariantEvaluator$tiTvRatio[weirdos], labels=current$TiTvVariantEvaluator$Sample[weirdos], pos=4, cex=.7, col="hot pink") -} -par(def.par)#- reset to default - -} - - - -variantplots<-function(current){ - par(mfcol=c(1,2)) - -variants<-c() -status<-c() -for(i in c(1:12)){ - load(sprintf("%s/exome.%i", path, i)); - info<-subset(data$CountVariants, Sample!="all") - variants<-c(variants, info$nSNPs) - status<-c(status, info$Novelty) - } - -length(unique(current$CountVariants$Sample))-1+length(variants[which(status=="novel")])->nvalues -plot(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")], xlim=c(0,nvalues), ylim=c(1,25000), log="y", col="red", main="Current samples compared to previous samples from 12 sets", ylab="Per sample #SNPs", xlab="sample") -points(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")], col="blue") -points(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], col="black") -points(c(length(unique(current$CountVariants$Sample)):nvalues), variants[which(status=="novel")], pch=16, col="red") -points(c(length(unique(current$CountVariants$Sample)):nvalues), variants[which(status=="known")], pch=16, col="blue") -points(c(length(unique(current$CountVariants$Sample)):nvalues), variants[which(status=="all")], pch=16, col="black") - -legend("bottomleft", col=c("red", "blue", "black"), pch=c(1,1,1,16,16, 16),legend=c("novel variants:current set", "known variants:current set", "all varaints:current set", "novel variants:previous sets", "known 
variants:previous sets", "all variants: previous sets")) - -weirdos<-which(current$CountVariants$Sample %in% current$TiTvVariantEvaluator$Sample[which(current$TiTvVariantEvaluator$tiTvRatio <2.0)]) -if(length(weirdos)>0){ - -text(weirdos[c(1:(length(weirdos)/3))],current$CountVariants$nSNPs[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink") -} - -boxplot(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")],variants[which(status=="novel")], current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")],variants[which(status=="known")], current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], variants[which(status=="all")], col=rep(c("red", "blue", "black"), each=2), main="Current v. Previous per sample #SNPs", xlab="Sample Sets",ylab="SNPs per sample", xaxt="n", ylim=c(10,25000), log="y") -axis(side=1, at=c(1:6)-.2, labels=rep(c("current", "previous"), 3), cex.axis=.7) - if(length(weirdos)>0){ - - text(rep(c(5,3,1), each=(length(weirdos)/3)),current$CountVariants$nSNPs[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink") -} -legend("topleft",legend=c("novel", "known", "all"), fill=c("red", "blue", "black")) -par(def.par)#- reset to default - -} - -heteroplots<-function(current){ - par(mfcol=c(1,2)) - -hets<-c() -status<-c() -for(i in c(1:12)){ - load(sprintf("%s/exome.%i", path, i)); - info<-subset(data$CountVariants, Sample!="all") - hets<-c(hets, info$heterozygosity) - status<-c(status, info$Novelty) - } - -length(unique(current$CountVariants$Sample))-1+length(hets[which(status=="novel")])->nvalues -plot(current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")], xlim=c(0,nvalues), ylim=c(-0.0005, 0.0005), col="red", main="Current samples compared to previous samples from 
12 sets", ylab="Per sample heterozygosity", xlab="sample") -points(current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")], col="blue") -points(current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], col="black") -points(c(length(unique(current$CountVariants$Sample)):nvalues), hets[which(status=="novel")], pch=16, col="red") -points(c(length(unique(current$CountVariants$Sample)):nvalues), hets[which(status=="known")], pch=16, col="blue") -points(c(length(unique(current$CountVariants$Sample)):nvalues), hets[which(status=="all")], pch=16, col="black") - -legend("bottomleft", col=c("red", "blue", "black"), pch=c(1,1,1,16,16, 16),legend=c("novel variants:current set", "known variants:current set", "all varaints:current set", "novel variants:previous sets", "known variants:previous sets", "all variants: previous sets")) - -weirdos<-which(current$CountVariants$Sample %in% current$TiTvVariantEvaluator$Sample[which(current$TiTvVariantEvaluator$tiTvRatio <2.0)]) - if(length(weirdos)>0){ -text(weirdos[c(1:(length(weirdos)/3))],current$CountVariants$heterozygosity[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink") -} - -boxplot(current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")],hets[which(status=="novel")], current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")],hets[which(status=="known")], current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], hets[which(status=="all")], col=rep(c("red", "blue", "black"), each=2), main="Current v. 
Previous per sample #Heterozygousity", xlab="Sample Sets",ylab="Heterozygousity per sample", xaxt="n") -axis(side=1, at=c(1:6)-.2, labels=rep(c("current", "previous"), 3), cex.axis=.7) -if(length(weirdos)>0){ - -text(rep(c(5,3,1), each=(length(weirdos)/3)),current$CountVariants$heterozygosity[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink") -} -legend("topleft",legend=c("novel", "known", "all"), fill=c("red", "blue", "black")) -par(def.par)#- reset to default - -} - -novelAC<-function(current){ -ACs<-sort(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="novel")]) -orderbyAC<-order(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="novel")]) -varbyAC<-current$SimpleMetricsByAC.metrics$n[which(current$SimpleMetricsByAC.metrics$Novelty=="novel")][orderbyAC] -plot(ACs, varbyAC, type="l", log="xy", lwd=4, col="dark red", main="Novel AC", ylab="# variants (log scale)", xlab="AC (log scale)") - -for(i in c(1:12)){ - load(sprintf("%s/exome.%i", path, i)); - info<-data$SimpleMetricsByAC.metrics - ACs<-sort(info$AC[which(info$Novelty=="novel")]) - orderbyAC<-order(info$AC[which(info$Novelty=="novel")]) - varbyAC<-info$n[which(info$Novelty=="novel")][orderbyAC] - - lines(ACs, varbyAC, col="red") -} - -legend("topright",legend=c("current", "previous"), lwd=c(4,1), col=c("dark red", "red")) -} - -knownAC<-function(current){ -ACs<-sort(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="known")]) -orderbyAC<-order(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="known")]) -varbyAC<-current$SimpleMetricsByAC.metrics$n[which(current$SimpleMetricsByAC.metrics$Novelty=="known")][orderbyAC] -plot(ACs, varbyAC, type="l", log="xy", lwd=4, col="dark blue", main="Known AC", ylab="# variants (log scale)", xlab="AC (log scale)") - -for(i in c(1:12)){ - load(sprintf("%s/exome.%i", path, i)); - 
info<-data$SimpleMetricsByAC.metrics - ACs<-sort(info$AC[which(info$Novelty=="known")]) - orderbyAC<-order(info$AC[which(info$Novelty=="known")]) - varbyAC<-info$n[which(info$Novelty=="known")][orderbyAC] - lines(ACs, varbyAC, col="light blue") -} - -legend("topright",legend=c("current", "previous"), lwd=c(4,1), col=c("dark blue", "light blue")) -} - -AllAC<-function(current){ -ACs<-sort(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="all")]) -orderbyAC<-order(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="all")]) -varbyAC<-current$SimpleMetricsByAC.metrics$n[which(current$SimpleMetricsByAC.metrics$Novelty=="all")][orderbyAC] -plot(ACs, varbyAC, type="l", log="xy", lwd=4, col="Black", main="All AC", ylab="# variants (log scale)", xlab="AC (log scale)") - -for(i in c(1:12)){ - load(sprintf("%s/exome.%i", path, i)); - info<-data$SimpleMetricsByAC.metrics - ACs<-sort(info$AC[which(info$Novelty=="all")]) - orderbyAC<-order(info$AC[which(info$Novelty=="all")]) - varbyAC<-info$n[which(info$Novelty=="all")][orderbyAC] - - lines(ACs, varbyAC, col="dark grey") -} - -legend("topright",legend=c("current", "previous"), lwd=c(4,1), col=c("black", "dark grey")) -} - diff --git a/R/DataProcessingReport/tearsheetMaker.r b/R/DataProcessingReport/tearsheetMaker.r deleted file mode 100644 index 68cfe30e1..000000000 --- a/R/DataProcessingReport/tearsheetMaker.r +++ /dev/null @@ -1,34 +0,0 @@ -source("/humgen/gsa-pipeline/.repository/R/DataProcessingReport/Tearsheet.R") -cmdargs = gsa.getargs( - list( - title = list(value=NA, doc="Title for the tearsheet"), - tsv = list(value=NA, doc="pipeline tsv file"), - evalroot = list(value=NA, doc="VariantEval file base (everything before the .eval)"), - tearout = list(value=NA, doc="Output path for tearsheet PDF")#, - ), - doc="Creates a tearsheet" -); - -read.delim(cmdargs$tsv, header=FALSE)->settable - -squids<-unique(settable[,1]) - -lane<-data.frame() 
-samp<-data.frame() -for(squid in squids){ - gsa.read.squidmetrics(squid, TRUE)->lanemetrics - addlanes<-lanemetrics[which(lanemetrics$"External ID" %in% settable[,2]),] - gsa.read.squidmetrics(squid, FALSE)->samplemetrics - addsamps<-samplemetrics[which(samplemetrics$"Sample" %in% settable[,2]),] - lane<-rbind(lane, addlanes) - samp<-rbind(samp, addsamps) -} -print("Picard Data Obtained...") -gsa.read.gatkreport(paste(cmdargs$evalroot, ".eval", sep=""))->basiceval -gsa.read.gatkreport(paste(cmdargs$evalroot, ".extraFC.eval", sep=""))->FCeval -gsa.read.gatkreport(paste(cmdargs$evalroot, ".extraSA.eval", sep=""))->SAeval -print("Evals read") - pdf(file= cmdargs$tearout, width=22, height=17, pagecentre=TRUE, pointsize=24) - print("PDF created...") -tearsheet() -dev.off() \ No newline at end of file diff --git a/R/GATKRunReport.R b/R/GATKRunReport.R deleted file mode 100644 index 2c9c21dbe..000000000 --- a/R/GATKRunReport.R +++ /dev/null @@ -1,194 +0,0 @@ -require("plotrix") -args = commandArgs(TRUE); - -onCMDLine = ! is.na(args[1]) -if (! is.na(args[3]) ) { name = args[3] } else { name = "" } - -if ( onCMDLine ) { - print(paste("Reading data from", args[1])) - d = read.table(args[1], header=T, sep="\t") - #d$start.time = as.Date(d$start.time) - d$end.time = as.Date(d$end.time) -} # only read into d if its' available, otherwise assume the data is already loaded - -# The unknown records are from the Broad -d$domain.name[d$domain.name == "unknown"] = "broadinstitute.org" - -noRecords <- function(name) { - print(paste("No records", name)) - frame() - title(paste("No records of", name), cex=2) -} - -reportCountingPlot <- function(values, name, moreMargin = 0, ...) { - #print(length(values)) - if ( length(values) > 0 ) { - par(las=2) # make label text perpendicular to axis - oldMar <- par("mar") - par(mar=c(5,8+moreMargin,4,2)) # increase y-axis margin. - t = table(factor(values)) - barplot(sort(t), horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", log="x", ...) 
- par("mar" = oldMar) - par("las" = 1) - } else { - noRecords(name) - } -} - -reportConditionalCountingPlot <- function(values, conditions, name, moreMargin = 0, ...) { - if ( length(values) > 0 ) { - t = table(values, conditions) - t = t[, order(colSums(t))] - #print(list(t = t)) - if ( ! is.null(dim(t)) ) { - par(las=2) # make label text perpendicular to axis - oldMar <- par("mar") - par(mar=c(5,8+moreMargin,4,2)) # increase y-axis margin. - nconds = dim(t)[2] - cols = rainbow(nconds) - barplot(t, legend.text = T, horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", col=cols, cex=0.5, ...) - par("mar" = oldMar) - par("las" = 1) - } else { - noRecords(name) - } - } else { - noRecords(name) - } -} - - -reportHist <- function(values, name, ...) { - if ( ! all(is.na(values) ) ) - hist(values, main=name, 20, xlab="", col="cornflowerblue", ...) -} - -myTable <- function(x, y, reqRowNonZero = F) { - table <- prop.table(table(x, y), 2) - ncols = dim(table)[2] - - #print(table) - if ( reqRowNonZero ) - table = table[addmargins(table)[1:dim(table)[1],ncols] > 0,] - - return(table) -} - -# todo -- must be robust to smaller sizes - -plotTable <- function(table, name, ...) { - ncols = dim(table)[2] - nrows = dim(table)[1] - if ( ! is.null(nrows) ) { - cols = rainbow(nrows) - tableMin = min(apply(table, 2, min)) - tableMax = max(apply(table, 2, max)) - plot( as.numeric(apply(table, 2, sum)), ylim=c(tableMin, tableMax), type="n", main = name, ylab="Frequency", xlab="Date", xaxt="n", ...) 
- axis(1, 1:ncols, labels=colnames(table)) - for ( i in 1:nrows ) - points(table[i,], type="b", col=cols[i]) - legend("topright", row.names(table), fill=cols, cex=0.5) - #return(table) - } -} - -RUNNING_GATK_RUNTIME <- 60 * 5 # 5 minutes => bad failure - -if ( onCMDLine ) pdf(args[2]) - -successfulRuns <- function(d) { - x <- rep("Successful", length(d$exception.msg)) - x[d$exception.msg != "NA" & d$is.user.exception == "true"] <- "Failed with UserException" - x[d$exception.msg != "NA" & d$is.user.exception == "false"] <- "Failed with StingException" - x[d$exception.msg != "NA" & (d$is.user.exception == "NA" | is.na(d$is.user.exception))] <- "Failed with StingException before UserException code" - return(x) -} - -addSection <- function(name) { - par("mar", c(5, 4, 4, 2)) - frame() - title(name, cex=2) -} - -dropit <- function (d, columns = names(d), ...) -{ - d[columns] = lapply(d[columns], "[", drop=TRUE, ...) - d -} - -generateOneReport <- function(d, header, includeByWeek = T) { - head <- function(s) { - return(paste("Section:", header, "\n", s)) - } - - excepted <- dropit(subset(d, exception.msg != "NA")) - UserExceptions <- dropit(subset(excepted, is.user.exception == "true")) - StingExceptions <- dropit(subset(excepted, is.user.exception == "false" | is.user.exception == "NA" | is.na(is.user.exception))) - - addSection(paste("GATK run report", name, "for", Sys.Date(), "\nwith", dim(d)[1], "run repository records")) - - reportCountingPlot(d$walker.name, head("Walker invocations")) - reportConditionalCountingPlot(d$user.name, d$walker.name, head("Walker invocations by user")) - reportCountingPlot(d$svn.version, head("SVN version")) - reportConditionalCountingPlot(d$svn.version, d$user.name, head("SVN by user")) - - # cuts by time - if ( includeByWeek ) { - plotTable(table(rep("GATK Invocations", length(d$end.time)), cut(d$end.time, "weeks")), head("GATK Invocations by week")) - plotTable(myTable(successfulRuns(d), cut(d$end.time, "weeks")), head("Successful and 
failing GATK invocations per week")) - - plotTable(myTable(d$svn.version, cut(d$end.time, "weeks")), head("SVN version by week")) - } - plotTable(table(rep("GATK Invocations", length(d$end.time)), d$end.time), head("GATK Invocations by day")) - plotTable(myTable(d$svn.version, d$end.time), head("SVN version by day")) - - # - # Exception handling - # - addExceptionSection <- function(subd, subname, exceptionColor) { - addSection(paste(subname)) - #print(list(subd = length(subd$end.time), name=subname)) - reportCountingPlot(subd$walker.name, head(paste("Walkers with", subname)), col=exceptionColor) - reportCountingPlot(subd$exception.at, head(paste(subname, "locations")), 12, col=exceptionColor) - #reportCountingPlot(subd$exception.msg, head(paste(subname, "messages")), 12, col=exceptionColor) - reportConditionalCountingPlot(subd$user.name, subd$exception.at, head(paste("Walker invocations by user for", subname)), 12) - - if ( includeByWeek && length(subd$end.time) > 0 ) { - plotTable(myTable(subd$walker.name, cut(subd$end.time, "weeks"), reqRowNonZero = T), head(paste("Walkers with", subname,"by week")), col=exceptionColor) - } - } - - addExceptionSection(excepted, "Exceptions", "grey") - reportCountingPlot(excepted$user.name, head("Usernames generating exceptions"), col="grey") - - addExceptionSection(StingExceptions, "StingExceptions", "red") - addExceptionSection(UserExceptions, "UserExceptions", "blue") - - - Gb <- 1024^3 - reportHist(d$total.memory / Gb, head("Used memory")) - reportHist(d$max.memory / Gb, head("Max memory")) - - min <- 60 - reportHist(log10(d$run.time / min), head("Run time (log10[min])")) - - reportCountingPlot(d$user.name, head("user")) - reportCountingPlot(d$domain.name, head("Domain name")) - #reportCountingPlot(d$host.name, head("host")) - - reportCountingPlot(d$java, head("Java version")) - #reportCountingPlot(d$machine, head("Machine")) - #reportCountingPlot(d$working.directory, head("Working directory")) -} - -RUNME = T -if ( RUNME ) { 
- lastWeek = levels(cut(d$end.time, "weeks"))[-1] - generateOneReport(d, "Overall") - #generateOneReport(subset(d, end.time >= lastWeek), "Just last week to date", includeByWeek = F) -} - -if ( onCMDLine ) dev.off() - - - diff --git a/R/PlotDepthOfCoverage.R b/R/PlotDepthOfCoverage.R deleted file mode 100644 index c4e2b271b..000000000 --- a/R/PlotDepthOfCoverage.R +++ /dev/null @@ -1,144 +0,0 @@ -args <- commandArgs(TRUE) -docBase <- args[1] - -## APPEND THE SUFFIXES ## - -locusStats <- paste(docBase,".sample_locus_statistics",sep="") -targetStats <- paste(docBase,".sample_interval_statistics",sep="") -sampleSum <- paste(docBase,".sample_summary_statistics",sep="") -sampleStats <- paste(docBase,".sample_statistics",sep="") -targetSum <- paste(docBase,".sample_interval_summary",sep="") - -## DEFINE THE PLOTTING FUNCTIONS ## - -PlotDepths <- function(X) { - pdf("Depth_Histogram_All_Samples.pdf") - Y <- as.matrix(X) - colors <- rainbow(nrow(Y),gamma=0.8) - plot(Y[1,],col=colors[1],type="b",xlab="",xaxt="n",ylab="Number of Loci") - axis(1,labels=FALSE) - labels <- colnames(X) - text(1:ncol(Y),par("usr")[3]-(100/6000)*par("usr")[4],srt=45,adj=1,labels=labels,xpd=TRUE,cex=0.7) - for ( jj in 2:nrow(Y) ) { - points(Y[jj,],col=colors[jj],type="b") - } - ymax = par("usr")[4] - xmax = par("usr")[2] - legend(y=0.95*ymax,x=0.8*xmax,col=colors,rownames(X),lty=c(1),cex=0.5) - dev.off() -} - -PlotLocusQuantiles <- function(X) { - pdf("Per_Sample_Coverage_Quantiles.pdf") - Y <- as.matrix(X) - Y <- Y/sum(Y[1,]) - Z <- matrix(nrow=nrow(Y),ncol=ncol(Y)) - for ( ii in 1:nrow(Y) ) { - for ( jj in 1:ncol(Y) ) { - # see how much density is in the remaining columns - Z[ii,jj] = sum(Y[ii,jj:ncol(Y)]) - } - } - - medians = matrix(nrow=1,ncol=ncol(Z)) - quan90 = matrix(nrow=1,ncol=ncol(Z)) - for ( cc in 1:ncol(Z) ) { - medians[cc] = quantile(Z[,cc],0.75) - quan90[cc] = quantile(Z[,cc],1) - } - - plot(t(medians),xlab="",xaxt="n",ylab="Proportion of loci with >X 
coverage",type="b",col="blue",yaxp=c(0,1,10)) - axis(1,labels=FALSE) - parseColNames <- function(K) { - M = matrix(nrow=1,ncol=length(K)) - number = 0 - for ( lab in K ) { - number = 1 + number - g = unlist(strsplit(lab,split="_")) - M[1,number] = g[2] - } - - return(M) - } - labels <- parseColNames(colnames(X)) - text(1:length(labels),par("usr")[3]-0.025,srt=90,adj=1,labels=labels,xpd=TRUE,cex=(0.8/32)*length(labels),lheight=(0.8/32)*length(labels)) - points(t(quan90),type="b",col="red") - legend(x=floor(0.6*length(labels)),y=1,c("75% of samples","100% of samples"),col=c("red","blue"),lty=c(1,1)) - dev.off() -} - -HistogramMedians <- function(X) { - pdf("Per_Sample_Median_Histogram.pdf") - hist(as.numeric(as.matrix(unlist(X[1:nrow(X)-1,5]))),floor(nrow(X)/2),xlab="Median Coverage",ylab="Number of Samples", main="Median coverage acrosss samples",col="grey") - dev.off() -} - -HeatmapLocusTable <- function(X) { - pdf("Locus_Coverage_HeatMap.pdf") - Y <- as.matrix(X) - heatmap(Y,Rowv=NA,Colv=NA) - dev.off() -} - -PlotMeanMedianQuartiles <- function(X) { - pdf("Per_Sample_Mean_Quantile_Coverage.pdf") - colors <- rainbow(4,start=0.6,end=0.9,gamma=1) - means = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,3]))) - medians = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,5]))) - thirdQ = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,4]))) - firstQ = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,6]))) - plot(means,xlab="",ylab="Depth of Coverage",xaxt="n",col=colors[1],pch=3,type="b",ylim=c(0,max(thirdQ))) - points(firstQ,col=colors[2],pch=2,type="b") - points(medians,col=colors[3],pch=1,type="b") - points(thirdQ,col=colors[4],pch=2,type="b") - axis(1,labels=FALSE) - labels <- X[1:nrow(X)-1,1] - text(1:nrow(X)-1,par("usr")[3]-(50/2500)*par("usr")[4],srt=90,adj=1,labels=labels,xpd=TRUE,cex=0.5) - text(5*nrow(X)/8,par("usr")[3]-(350/2500)*par("usr")[4],adj=1,labels="SAMPLE_ID",xpd=TRUE) - legend(x=nrow(X)/10,y=par("usr")[4]-(200/2500)*par("usr")[4],c("Mean","25% Quantile","Median","75% 
Quantile"),col=colors,lty=c(1),cex=0.8,pch=c(3,2,1,2)) - dev.off() -} - -PlotOnlyMeanMedian <- function(X) { - pdf("Per_Sample_Mean_Median_Only.pdf") - colors <- rainbow(2,start=0.6,end=0.9,gamma=1) - means = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,3]))) - medians = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,5]))) - plot(means,xlab="",ylab="Depth of Coverage",xaxt="n",col=colors[1],pch=3,type="b",ylim=c(0,max(c(max(means),max(medians))))) - points(medians,col=colors[2],pch=1,type="b") - axis(1,labels=FALSE) - labels <- X[1:nrow(X)-1,1] - text(1:nrow(X)-1,par("usr")[3]-(50/2500)*par("usr")[4],srt=90,adj=1,labels=labels,xpd=TRUE,cex=0.5) - text(5*nrow(X)/8,par("usr")[3]-(350/2500)*par("usr")[4],adj=1,labels="SAMPLE_ID",xpd=TRUE) - legend(x=nrow(X)/10,y=par("usr")[4]-(200/2500)*par("usr")[4],c("Mean","Median"),col=colors,lty=c(1),cex=0.8,pch=c(3,2)) - dev.off() -} - -PlotOnlyQuartiles <- function(X) { - pdf("Per_Sample_Quartiles_Only.pdf") - colors <- rainbow(2,start=0.6,end=0.9,gamma=1) - thirdQ = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,4]))) - firstQ = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,6]))) - plot(thirdQ,xlab="",ylab="Depth of Coverage",xaxt="n",col=colors[1],pch=3,type="b",ylim=c(0,max(thirdQ))) - points(firstQ,col=colors[2],pch=2,type="b") - axis(1,labels=FALSE) - labels <- X[1:nrow(X)-1,1] - text(1:nrow(X)-1,par("usr")[3]-(50/2500)*par("usr")[4],srt=90,adj=1,labels=labels,xpd=TRUE,cex=0.5) - text(5*nrow(X)/8,par("usr")[3]-(350/2500)*par("usr")[4],adj=1,labels="SAMPLE_ID",xpd=TRUE) - legend(x=nrow(X)/10,y=par("usr")[4]-(200/2500)*par("usr")[4],c("75% Quantile","25% Quantile"),col=colors,lty=c(1),cex=0.8,pch=c(3,2)) - dev.off() -} - -## PLOT SAMPLE STATISTICS -TO_PLOT <- read.table(sampleStats) -PlotDepths(TO_PLOT) -PlotLocusQuantiles(TO_PLOT) -## PLOT SAMPLE SUMMARY -TO_PLOT <- read.table(sampleSum,header=TRUE) -PlotMeanMedianQuartiles(TO_PLOT) -PlotOnlyMeanMedian(TO_PLOT) -PlotOnlyQuartiles(TO_PLOT) -HistogramMedians(TO_PLOT) -## PLOT LOCUS STATISTICS 
-TO_PLOT <- read.table(locusStats) -HeatmapLocusTable(TO_PLOT) diff --git a/R/VariantRecalibratorReport/VariantRecalibratorReport.R b/R/VariantRecalibratorReport/VariantRecalibratorReport.R deleted file mode 100644 index aa4622f01..000000000 --- a/R/VariantRecalibratorReport/VariantRecalibratorReport.R +++ /dev/null @@ -1,254 +0,0 @@ -library(ellipse); -library(hexbin); - -getAnnIndex <- function(d, ann) { - index = -1; - for (i in c(1:length(names(d)))) { - if (names(d)[i] == ann) { - index = i; - } - } - - index; -} - -getClusterAnnIndex <- function(c, ann) { - index = -1; - - for (i in c(1:length(c[[1]]$anns))) { - if (c[[1]]$anns[i] == ann) { - index = i; - } - } - - index; -} - -plotAnn <- function(d.known, d.novel, d.loci, ann) { - index = getAnnIndex(d.known, ann); - - k = hist(d.known[,index], breaks=100, plot=FALSE); - n = hist(d.novel[,index], breaks=100, plot=FALSE); - - plot(k$mids, k$density, type="b", col="blue", ylim=c(0, max(k$density)), lwd=2, xlab=ann, ylab="Density", bty="n"); - points(n$mids, n$density, type="b", col="red", lwd=2); - - if (!is.na(d.loci)) { - legend("topright", c("Known", "Novel", "Suspicious loci"), col=c("blue", "red", "yellow3"), pch=c(21, 21, 18)); - } else { - legend("topright", c("Known", "Novel"), col=c("blue", "red"), pch=21); - } - - if (!is.na(d.loci)) { - for (i in c(1:nrow(d.loci))) { - points(d.loci[i, index], 0, col="yellow3", pch=18, cex=2.0); - } - } -} - -read.clusters <- function(filename) { - con = file(filename, "r", blocking = FALSE) - lines = readLines(con) - close(con); - - anns = c(); - - annIndex = 1; - clusterIndex = 1; - clusters = c(); - - conversions = c(); - - for (line in lines) { - if (length(grep("ANNOTATION", line)) > 0) { - linePieces = unlist(strsplit(line, ",")); - - anns = c(anns, linePieces[2]); - conversions[[annIndex]] = list(ann = linePieces[2], offset = as.numeric(linePieces[3]), multiplier = as.numeric(linePieces[4])); - - annIndex = annIndex + 1; - } else if (length(grep("CLUSTER", 
line)) > 0) { - linePieces = unlist(strsplit(line, ",")); - - mixtureWeight = linePieces[2]; - mu = linePieces[3:(3+length(anns)-1)]; - cov = linePieces[(3+length(anns)):length(linePieces)]; - - clusters[[clusterIndex]] = list( - anns = anns, - conversions = conversions, - mixtureWeight = as.numeric(mixtureWeight), - means = as.numeric(mu), - cov = matrix(cov, nrow=length(anns), ncol=length(anns)) - ); - clusterIndex = clusterIndex + 1; - } - } - - clusters; -} - -clusterLimits <- function( vals, defaultMin, defaultMax ) { - x = c(max(defaultMin, min(vals, -2)), min(defaultMax, max(vals, 2))) - print(x) - x -} - -getClusterColor <- function(clusterIndex, nClusters) { - clusterColors(nClusters)[clusterIndex] -} - -clusterColors <- function(nClusters) { - rainbow(nClusters) -} - - -makeAxis <- function( num, vals, off1, mult1, xmin, xmax ) { - #labels=as.integer(seq(from=min(vals), to=max(vals), by=(abs(min(vals)) + abs(max(vals)))/5)) - #at=seq(from=min((vals - off1)/mult1), to=max((vals - off1)/mult1), by=(abs(min((vals - off1)/mult1)) + abs(max((vals - off1)/mult1)))/5) - - #from = xmin * mult1 + off1 - #to = xmax * mult1 + off1 - #print(list(off1=off1, mult1=mult1, xmin=xmin, xmax=xmax)) - at = as.integer(seq(from=xmin, to=xmax, by=(abs(xmin) + abs(xmax))/5)) - labels = as.integer(at * mult1 + off1) - #print(list(from=from, to=to, by=(abs(from) + abs(to))/5)) - #print(list(labels=labels, at=at)) - - axis(num, labels=labels, at=at); - -# axis(num, -# labels=as.integer(seq(from=min(vals), to=max(vals), by=(abs(min(vals)) + abs(max(vals)))/5)), -# at=seq(from=min((vals - off1)/mult1), to=max((vals - off1)/mult1), by=(abs(min((vals - off1)/mult1)) + abs(max((vals - off1)/mult1)))/5) -# ); -} - -plotClusters <- function(d.known, d.novel, d.loci, c, ann1, ann2, filename, maxVariants = -1) { - index1 = getAnnIndex(d.known, ann1); - index2 = getAnnIndex(d.known, ann2); - - cindex1 = getClusterAnnIndex(c, ann1); - cindex2 = getClusterAnnIndex(c, ann2); - - mult1 = 
c[[1]]$conversions[[cindex1]]$multiplier; - off1 = c[[1]]$conversions[[cindex1]]$offset; - - mult2 = c[[1]]$conversions[[cindex2]]$multiplier; - off2 = c[[1]]$conversions[[cindex2]]$offset; - - xvalsForLims = clusterLimits(d.known[,index1], -4, 4) - yvalsForLims = clusterLimits(d.known[,index2], -4, 4) - xlims = c(min(xvalsForLims), 1.2*max(xvalsForLims)); - ylims = c(min(yvalsForLims), max(yvalsForLims)); - - # par(mar=c(5, 6, 2, 5)); - plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=xlims, ylim=ylims, xlab=ann1, ylab=ann2, bty="n"); - - mv.known = if (maxVariants == -1 | maxVariants >= nrow(d.known)) { seq(1, nrow(d.known)) } else { as.integer(runif(maxVariants, 1, nrow(d.known)+1))} - mv.novel = if (maxVariants == -1 | maxVariants >= nrow(d.novel)) { 1:nrow(d.novel) } else { as.integer(runif(maxVariants, 1, nrow(d.novel)+1)) } - - print(dim(mv.known)) - print(maxVariants) - - points(((d.known[,index1] - off1)/mult1)[mv.known], ((d.known[,index2] - off2)/mult2)[mv.known], pch=19, cex=0.3, col="#0000FF33"); - points(((d.novel[,index1] - off1)/mult1)[mv.novel], ((d.novel[,index2] - off2)/mult2)[mv.novel], pch=19, cex=0.3, col="#FF000033"); - - nClusters = length(c) - for (clusterIndex in c(1:nClusters)) { - mu = c(c[[clusterIndex]]$means[cindex1], c[[clusterIndex]]$means[cindex2]); - cov = matrix(as.numeric( - matrix( - c( - c[[clusterIndex]]$cov[cindex1,cindex1], - c[[clusterIndex]]$cov[cindex2,cindex1], - c[[clusterIndex]]$cov[cindex1,cindex2], - c[[clusterIndex]]$cov[cindex2,cindex2] - ), - nrow=2, ncol=2 - ) - ), nrow=2, ncol=2 - ); - - weight = c[[clusterIndex]]$mixtureWeight; - color = getClusterColor(clusterIndex, nClusters); - lineweight = ifelse(weight > 0.50, 4, 3); - - points(mu[1], mu[2], pch=21, col=color, cex=0.5); - points(ellipse(t(cov), centre=mu), type="l", lwd=lineweight, col=color); - } - - makeAxis(1, d.novel[,index1], off1, mult1, xvalsForLims[1], xvalsForLims[2]) - makeAxis(2, d.novel[,index2], off2, mult2, yvalsForLims[1], yvalsForLims[2]) - - 
# add points legend on the lower left - if (!is.na(d.loci)) { - legend("bottomleft", c("Known", "Novel", "Suspicious loci"), col=c("blue", "red", "yellow3"), pch=19); - } else { - legend("bottomleft", c("Known", "Novel"), col=c("blue", "red"), pch=19); - } - - # add upper right legend with cluster id and weights - weights = round(sapply(c, function(x) x$mixtureWeight),2) - clusterNames = paste("C", paste(1:nClusters), sep="") - clusterLegendNames = paste(clusterNames, weights, sep="-W=") - legend("topright", clusterLegendNames, fill=clusterColors(nClusters)) - - if (!is.na(d.loci)) { - points((d.loci[,index1] - off1)/mult1, (d.loci[,index2] - off2)/mult2, pch=19, cex=0.8, col="yellow3"); - } -} - -args = commandArgs(TRUE); - -plotRoot = args[1]; -if (is.na(plotRoot)) { plotRoot = "test"; } - -clusterFile = args[2]; -if (is.na(clusterFile)) { clusterFile = "/Volumes/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v8/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized"; } - -vcfTable = args[3]; -if (is.na(vcfTable)) { vcfTable = "/Volumes/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v8/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.table"; } - -lociFile = args[4]; -if (is.na(lociFile) | lociFile == "NA" ) { lociFile = NA; } - -maxVariants = args[5]; -if (is.na(maxVariants)) { maxVariants = 5000; } -maxVariants = as.integer(maxVariants) - -greedy = args[6] -if (is.na(greedy)) { greedy = -1; } -greedy = as.integer(greedy) - -l = c(); -if (!is.na(lociFile)) { - t = read.table(lociFile, header=TRUE); - l = t$POS; -} - -print("Greedy reading") -d = read.table(vcfTable, header=TRUE, nrows = greedy); -c = read.clusters(clusterFile); - -d.known = d[which(d$DB == 1 | d$ID != "."),]; -d.novel = d[which(d$DB == 0 | d$ID == "."),]; -d.loci = NA; -if (length(l) > 0) { - d.loci = d[which(d$POS %in% l),]; -} - -pdf(paste(plotRoot, ".clusterReport.pdf", sep="")); - -for (ann1 in c[[1]]$anns) { - print(ann1) - 
plotAnn(d.known, d.novel, d.loci, ann1); - - for (ann2 in c[[1]]$anns) { - if (ann1 != ann2) { - print(paste("-- v ", ann2)) - plotClusters(d.known, d.novel, d.loci, c, ann1, ann2, maxVariants=maxVariants); - } - } -} - -dev.off(); diff --git a/R/VariantReport/VariantReport.R b/R/VariantReport/VariantReport.R deleted file mode 100644 index 7f83eacfd..000000000 --- a/R/VariantReport/VariantReport.R +++ /dev/null @@ -1,442 +0,0 @@ -suppressPackageStartupMessages(library(gsalib)); -suppressPackageStartupMessages(library(gplots)); - -eval.getMetrics <- function(eval, jexl_expression) { - callset.counts = eval$CountVariants[which(eval$CountVariants$evaluation_name == "eval" & eval$CountVariants$comparison_name == "dbsnp" & eval$CountVariants$jexl_expression == jexl_expression),]; - callset.counts.titv = eval$TiTv[which(eval$TiTv$evaluation_name == "eval" & eval$TiTv$comparison_name == "dbsnp" & eval$TiTv$jexl_expression == jexl_expression),]; - - callset.calledCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "all"),]$nVariantLoci; - callset.calledCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio; - - callset.knownCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "known"),]$nVariantLoci; - callset.knownCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio; - - callset.novelCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "novel"),]$nVariantLoci; - callset.novelCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio; - - callset.allFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == 
"all"),]$nVariantLoci; - callset.allFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio; - - callset.knownFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "known"),]$nVariantLoci; - callset.knownFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio; - - callset.novelFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "novel"),]$nVariantLoci; - callset.novelFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio; - - metrics = list( - all = callset.calledCounts, - all.titv = callset.calledCounts.titv, - - known = callset.knownCounts, - known.titv = callset.knownCounts.titv, - - novel = callset.novelCounts, - novel.titv = callset.novelCounts.titv, - - filtered.all = callset.allFilteredCounts, - filtered.all.titv = callset.allFilteredCounts.titv, - - filtered.known = callset.knownFilteredCounts, - filtered.known.titv = callset.knownFilteredCounts.titv, - - filtered.novel = callset.novelFilteredCounts, - filtered.novel.titv = callset.novelFilteredCounts.titv - ); -} - -.plot.callsetConcordance.getLabelText <- function(name, othername, metrics, filtered.metrics=NA, union) { - if (is.na(filtered.metrics)) { - text = sprintf("%s (%0.01f%% of union)\nCalled:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f", - name, 100*metrics$all/union$all.withfiltered, - metrics$all, metrics$all.titv, - metrics$known, metrics$known.titv, - metrics$novel, metrics$novel.titv - ); - } else { - text = sprintf("%s (%0.01f%% of union)\nCalled in %s, filtered in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f\n\nCalled in %s, 
absent in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f", - name, 100*(metrics$all + filtered.metrics$all)/union$all.withfiltered, - - name, othername, - filtered.metrics$all, filtered.metrics$all.titv, - filtered.metrics$known, filtered.metrics$known.titv, - filtered.metrics$novel, filtered.metrics$novel.titv, - - name, othername, - metrics$all, metrics$all.titv, - metrics$known, metrics$known.titv, - metrics$novel, metrics$novel.titv - ); - } -} - -plot.titlePage <- function(title, author) { - textplot(sprintf("Automated Variant Report\n\n%s\n%s\n%s\n", title, author, Sys.Date())); -} - -.plot.variantTable.getRowText <- function(eval, jexl_expression) { - allVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "all"),]$nVariantLoci; - knownVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "known"),]$nVariantLoci; - novelVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "novel"),]$nVariantLoci; - - allTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "all"),]$ti.tv_ratio; - knownTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "known"),]$ti.tv_ratio; - novelTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "novel"),]$ti.tv_ratio; - - cbind(allVariants, knownVariants, sprintf("%0.2f", knownTiTv), novelVariants, sprintf("%0.2f", novelTiTv)); -} - -plot.variantTable <- function(eval, title) { - aonly.row = 
.plot.variantTable.getRowText(eval, eval$CallsetOnlyNames[1]); - aonly.filtered.row = .plot.variantTable.getRowText(eval, eval$CallsetFilteredNames[1]); - intersection.row = .plot.variantTable.getRowText(eval, "Intersection"); - bonly.row = .plot.variantTable.getRowText(eval, eval$CallsetOnlyNames[2]); - bonly.filtered.row = .plot.variantTable.getRowText(eval, eval$CallsetFilteredNames[2]); - - variantsummary = as.data.frame(rbind(bonly.row, bonly.filtered.row, intersection.row, aonly.filtered.row, aonly.row)); - - rownames(variantsummary) = c( - sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - "Intersection", - sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), - sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]) - ); - colnames(variantsummary) = c("counts (all)", "counts (known)", "ti/tv (known)", "counts (novel)", "ti/tv (novel)"); - - textplot(variantsummary); -} - -plot.callsetConcordance <- function(eval, col=c("#FF6342", "#63C6DE", "#ADDE63")) { - aonly = eval.getMetrics(eval, eval$CallsetOnlyNames[1]); - aonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[1]); - intersection = eval.getMetrics(eval, "Intersection"); - bonly = eval.getMetrics(eval, eval$CallsetOnlyNames[2]); - bonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[2]); - - union = list( - all = intersection$all + aonly$all + bonly$all, - all.withfiltered = intersection$all + aonly$all + bonly$all + aonly.filtered$all + bonly.filtered$all - ); - - gsa.plot.venn(aonly$all + intersection$all + aonly.filtered$all, bonly$all + intersection$all + bonly.filtered$all, 0, intersection$all, 0, 0, pos=c(0.32, 0.32, 0.68, 0.70), col=col); - - text(0, 0.45, cex=1.2, pos=4, .plot.callsetConcordance.getLabelText(eval$CallsetNames[1], eval$CallsetNames[2], aonly, 
aonly.filtered, union)); - text(0.5, 0.75, cex=1.2, adj=c(0.5, 0.33), .plot.callsetConcordance.getLabelText("Intersection", NA, intersection, NA, union)); - text(1, 0.45, cex=1.2, pos=2, .plot.callsetConcordance.getLabelText(eval$CallsetNames[2], eval$CallsetNames[1], bonly, bonly.filtered, union)); -} - -plot.callsetConcordanceByAC <- function(eval, normalize=TRUE, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) { - aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); - aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]); - intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); - bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); - bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]); - - title = paste("Callset concordance per allele count (", novelty_name, " variants)", sep=""); - - if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { - aonly = intersection; - aonly$n = 0; - } - - if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { - bonly = intersection; - bonly$n = 0; - } - - if (length(intersection$AC) > 0 && length(aonly.filtered$AC) == 0) { - aonly.filtered = intersection; - aonly.filtered$n = 0; - } - - if (length(intersection$AC) > 0 && length(bonly.filtered$AC) == 0) { - bonly.filtered = intersection; - bonly.filtered$n = 0; - } - - #par.def = par(no.readonly = TRUE); - #par(mar=c(5, 5, 3, 5)); - - if (normalize == TRUE) { - norm = aonly$n + aonly.filtered$n + intersection$n + bonly$n + bonly.filtered$n; - matnorm = rbind(aonly$n/norm, aonly.filtered$n/norm, intersection$n/norm, bonly.filtered$n/norm, bonly$n/norm); - - barplot(matnorm, col=col, xlab="Allele count", ylab="", main=title, names.arg=intersection$AC, xlim=c(1, 1.2*max(intersection$AC)), ylim=c(0, 1.3), border=NA, yaxt="n", cex=1.3, cex.axis=1.3, cex.lab=1.3); - axis(2, at=seq(from=0, to=1, by=0.2), seq(from=0, to=1, by=0.2), cex=1.3, 
cex.axis=1.3); - mtext("Fraction", side=2, at=0.5, padj=-3.0, cex=1.3); - } else { - mat = rbind(aonly$n, aonly.filtered$n, intersection$n, bonly.filtered$n, bonly$n); - - #barplot(mat, col=col, xlab="Allele count", ylab="counts", main=title, names.arg=intersection$AC, xlim=c(1, max(intersection$AC)), ylim=c(0, 1), border=NA, cex=1.3, cex.axis=1.3, cex.lab=1.3); - - barplot(mat, col=col, xlab="Allele count", ylab="counts", main=title, names.arg=intersection$AC, xlim=c(1, 1.2*max(intersection$AC)), border=NA, cex=1.3, cex.axis=1.3, cex.lab=1.3); - #axis(2, at=seq(from=0, to=1, by=0.2), seq(from=0, to=1, by=0.2), cex=1.3, cex.axis=1.3); - #mtext("Fraction", side=2, at=0.5, padj=-3.0, cex=1.3); - } - - legend( - "topright", - c( - sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - "Intersection", - sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), - sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]) - ), - fill=rev(col), - cex=1.3 - ); - - #par(par.def); -} - -plot.alleleCountSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) { - aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); - aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]); - intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); - intersection.all = eval.getMetrics(eval, "Intersection"); - bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); - bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]); - - title = paste("Allele count spectrum (", novelty_name, " variants)", sep=""); - - if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { - aonly = intersection; - aonly$n = 0; - } - - if (length(intersection$AC) > 0 && 
length(bonly$AC) == 0) { - bonly = intersection; - bonly$n = 0; - } - - if (length(intersection$AC) > 0 && length(aonly.filtered$AC) == 0) { - aonly.filtered = intersection; - aonly.filtered$n = 0; - } - - if (length(intersection$AC) > 0 && length(bonly.filtered$AC) == 0) { - bonly.filtered = intersection; - bonly.filtered$n = 0; - } - - loci = (unique(eval$CountVariants$nProcessedLoci))[1]; - ymax = 10*max((1/1000)*loci*(1/c(1:max(intersection$AC)))); - - suppressWarnings(plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(1, ymax), xlab="Allele count", ylab="Number of variants", main=title, log="xy", bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3)); - suppressWarnings(points(intersection$AC, aonly$n + aonly.filtered$n + intersection$n, type="l", lwd=2, col=col[1])); - suppressWarnings(points(intersection$AC, aonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[1])); - suppressWarnings(points(intersection$AC, intersection$n, type="l", lwd=2, col=col[3])); - suppressWarnings(points(intersection$AC, bonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[4])); - suppressWarnings(points(intersection$AC, bonly$n + bonly.filtered$n + intersection$n, type="l", lwd=2, col=col[5])); - - #points(c(1:max(intersection$AC)), 0.9*(1/1000)*loci*(1/c(1:max(intersection$AC))), type="l", lwd=2, lty=2, col="black"); - - legend( - "bottomleft", - c( - sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - "Intersection", - sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), - sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2])#, - #sprintf("Neutral expectation ( 0.9*(1/1000)*%0.1f*(1/c(1:max(%d))) )", loci, max(intersection$AC)) - ), - lwd=c(2, 2, 3, 2, 2, 2), - lty=c(1, 2, 1, 2, 
1, 2), - col=c(rev(col), "black"), - cex=1.3 - ); -} - -eval.getMetricsByAc <- function(eval, jexl, novelty="all") { - piece = subset(eval$MetricsByAc, - evaluation_name == "eval" & - comparison_name == "dbsnp" & - as.character(jexl_expression) == as.character(jexl) & - filter_name == "called" & - novelty_name == novelty - ); -} - -plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) { - aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); - aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]); - intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); - bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); - bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]); - - title = paste("Ti/Tv spectrum (", novelty_name, " variants)", sep=""); - - if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { - aonly = intersection; - aonly$n = 0; - aonly$nTi = 0; - aonly$nTv = 0; - } - - if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { - bonly = intersection; - bonly$n = 0; - bonly$nTi = 0; - bonly$nTv = 0; - } - - if (length(intersection$AC) > 0 && length(aonly.filtered$AC) == 0) { - aonly.filtered = intersection; - aonly.filtered$n = 0; - aonly.filtered$nTi = 0; - aonly.filtered$nTv = 0; - } - - if (length(intersection$AC) > 0 && length(bonly.filtered$AC) == 0) { - bonly.filtered = intersection; - bonly.filtered$n = 0; - bonly.filtered$nTi = 0; - bonly.filtered$nTv = 0; - } - - titv.aonly.withfiltered = (aonly$nTi + aonly.filtered$nTi + intersection$nTi)/(aonly$nTv + aonly.filtered$nTv + intersection$nTv); - titv.aonly.withfiltered.finite = titv.aonly.withfiltered[which(is.finite(titv.aonly.withfiltered))]; - - titv.aonly = (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv); - titv.aonly.finite = titv.aonly[which(is.finite(titv.aonly))]; - - titv.intersection.finite = 
intersection$Ti.Tv[which(is.finite(intersection$Ti.Tv))]; - - titv.bonly = (bonly$nTi + intersection$nTi)/(bonly$nTv + intersection$nTv); - titv.bonly.finite = titv.bonly[which(is.finite(titv.bonly))]; - - titv.bonly.withfiltered = (bonly$nTi + bonly.filtered$nTi + intersection$nTi)/(bonly$nTv + bonly.filtered$nTv + intersection$nTv); - titv.bonly.withfiltered.finite = titv.bonly.withfiltered[which(is.finite(titv.bonly.withfiltered))]; - - titv.min = min(titv.aonly.withfiltered.finite, titv.aonly.finite, titv.intersection.finite, titv.bonly.finite, titv.bonly.withfiltered.finite); - titv.max = max(titv.aonly.withfiltered.finite, titv.aonly.finite, titv.intersection.finite, titv.bonly.finite, titv.bonly.withfiltered.finite); - - plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(0, 4), xlab="Allele count", ylab="Transition/transversion (Ti/Tv) ratio", main=title, bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3); - points(intersection$AC, (aonly.filtered$nTi + intersection$nTi)/(aonly.filtered$nTv + intersection$nTv), type="l", lwd=2, col=col[1]); - points(intersection$AC, (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv), type="l", lwd=2, lty=2, col=col[2]); - points(intersection$AC, intersection$Ti.Tv, type="l", lwd=2, col=col[3]); - points(intersection$AC, (bonly$nTi + intersection$nTi)/(bonly$nTv + intersection$nTv), type="l", lwd=2, lty=2, col=col[4]); - points(intersection$AC, (bonly.filtered$nTi + intersection$nTi)/(bonly.filtered$nTv + intersection$nTv), type="l", lwd=2, col=col[5]); - - abline(h=2.3, lty=2); - mtext("2.3", side=4, at=2.3, cex=0.9); - - abline(h=3.3, lty=2); - mtext("3.3", side=4, at=3.3, cex=0.9); - - #legend("topleft", c(eval$CallsetOnlyNames[1], "Intersection", eval$CallsetOnlyNames[2]), fill=col); - - legend( - "topleft", - c( - sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - sprintf("Intersection + called in %s, absent in %s", 
eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - "Intersection", - sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), - sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]) - ), - lwd=c(2, 2, 3, 2, 2), - lty=c(1, 2, 1, 2, 1), - col=rev(col), - cex=1.3 - ); -} - -plot.variantsPerSample2 <- function(eval) { - if (!is.na(eval$MetricsBySample)) { - metrics.all = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "all"),]; - metrics.known = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "known"),]; - metrics.novel = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "novel"),]; - - title = "Calls per sample"; - indices = order(metrics.all$nVariants, decreasing=TRUE); - - plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics.all$sample)), ylim=c(0, max(metrics.all$nVariants)), xlab="", ylab="Number of variants", main=title, bty="n"); - points(c(1:length(metrics.all$sample)), (metrics.all$nVariants)[indices], pch=21, col="black"); - points(c(1:length(metrics.known$sample)), (metrics.known$nVariants)[indices], pch=21, col="blue"); - points(c(1:length(metrics.novel$sample)), (metrics.novel$nVariants)[indices], pch=21, col="red"); - - legend("topright", c("All", "Known", "Novel"), pch=21, 
col=c("black", "blue", "red")); - - axis(1, at=c(1:length(metrics.all$sample)), labels=(metrics.all$sample)[indices], las=2, cex.axis=0.4); - } -} - -plot.variantsPerSample <- function(eval, novelty_name="all") { - if (!is.na(eval$SimpleMetricsBySample)) { - metrics = eval$SimpleMetricsBySample[which(eval$SimpleMetricsBySample$evaluation_name == "eval" & eval$SimpleMetricsBySample$comparison_name == "dbsnp" & as.character(eval$SimpleMetricsBySample$jexl_expression) == "none" & eval$SimpleMetricsBySample$filter_name == "called" & eval$SimpleMetricsBySample$novelty_name == novelty_name),]; - - title = paste("Calls per sample (", novelty_name, ")", sep=""); - indices = order(metrics$CountVariants, decreasing=TRUE); - - par.def = par(no.readonly = TRUE); - par(mar=c(5, 4, 4, 4)); - - plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics$row)), ylim=c(0, max(metrics$CountVariants)), xlab="", ylab="Number of variants", main=title, bty="n"); - points(c(1:length(metrics$row)), (metrics$CountVariants)[indices], pch=21, col="black"); - - axis(1, at=c(1:length(metrics$row)), labels=(metrics$row)[indices], las=2, cex.axis=0.4); - - par(new=TRUE); - plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=c(1, length(metrics$row)), ylim=c(min(metrics$TiTvRatio), 1.2*max(metrics$TiTvRatio)), xlab="", ylab="", main=title, bty="n"); - points(c(1:length(metrics$row)), (metrics$TiTvRatio)[indices], pch=19, col="black"); - - titvaxis = c(min(metrics$TiTvRatio), max(metrics$TiTvRatio)); - axis(4, at=titvaxis, labels=titvaxis, las=2); - - par(par.def); - } -} - -argspec = list( - evalRoot = list(value = NA, doc = "Path to the VariantEval R-output (omit the '.Analysis_Type.csv' part of the filename)"), - plotOut = list(value = NA, doc = "Path to the output PDF file"), - title = list(value = NA, doc = "The title of the report"), - author = list(value = NA, doc = "The author of the report") -); - -cmdargs = gsa.getargs(argspec, doc="Take VariantEval R-output and generate a series of plots 
summarizing the contents"); - -eval = gsa.read.eval(cmdargs$evalRoot); - -pdf(cmdargs$plotOut, width=10, height=10); - -plot.titlePage(cmdargs$title, cmdargs$author); - -plot.variantTable(eval); - -if (length(eval$CallsetNames) > 0) { - # Venn diagram - plot.callsetConcordance(eval); - - # Venn by AC (normalized) - plot.callsetConcordanceByAC(eval, novelty_name="all"); - plot.callsetConcordanceByAC(eval, novelty_name="known"); - plot.callsetConcordanceByAC(eval, novelty_name="novel"); - - # Venn by AC (unnormalized) - plot.callsetConcordanceByAC(eval, novelty_name="all", normalize=FALSE); - plot.callsetConcordanceByAC(eval, novelty_name="known", normalize=FALSE); - plot.callsetConcordanceByAC(eval, novelty_name="novel", normalize=FALSE); - - # Allele count spectrum - plot.alleleCountSpectrum(eval, novelty_name="all"); - plot.alleleCountSpectrum(eval, novelty_name="known"); - plot.alleleCountSpectrum(eval, novelty_name="novel"); - - # Ti/Tv spectrum - plot.titvSpectrum(eval, novelty_name="all"); - plot.titvSpectrum(eval, novelty_name="known"); - plot.titvSpectrum(eval, novelty_name="novel"); - - # Per-sample - #plot.variantsPerSample(eval); -} else { - #plot.variantsPerSample(eval, novelty_name="all"); - #plot.variantsPerSample(eval, novelty_name="known"); - #plot.variantsPerSample(eval, novelty_name="novel"); -} - -dev.off(); diff --git a/R/analyzeConcordance/analyzeConcordance.R b/R/analyzeConcordance/analyzeConcordance.R deleted file mode 100755 index 2cba55d0f..000000000 --- a/R/analyzeConcordance/analyzeConcordance.R +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env Rscript - -args <- commandArgs(TRUE) - -base_name = args[1] -input = args[2] - -d <- read.table(input, header=T) -# separate the data into filtered and unfiltered - -d.filtered <- d[d$filter_type=="filtered",] -d.unfiltered <- d[d$filter_type=="unfiltered",] - -if (nrow(d.filtered) > 0) { - d.display <- d.filtered -} else { - d.display <- d.unfiltered -} - -# -# Plot histograms of the known versus 
novel Ti/Tv -# - -outfile = paste(base_name, ".histograms.png", sep="") - -if (nrow(d.filtered) > 0) { - nFilterTypes <- 2 -} else { - nFilterTypes <- 1 -} - -bitmap(outfile, width=600, height=(300 * nFilterTypes), units="px") -par(cex=1.1, mfrow=c(1 * nFilterTypes,2)) -nbreaks <- 20 -color <- "grey" -xlim <- c(0,4) - -hist(d.unfiltered$known_titv, nbreaks, col=color, xlim=xlim) -hist(d.unfiltered$novel_titv, nbreaks, col=color, xlim=xlim) - -if (nrow(d.filtered) > 0) { - hist(d.filtered$known_titv, nbreaks, col=color, xlim=xlim) - hist(d.filtered$novel_titv, nbreaks, col=color, xlim=xlim) -} - -dev.off() - -# -# Plot samples in order of novel Ti/Tv versus known Ti/Tv -# - -outfile = paste(base_name, ".novel_vs_known_titv.png", sep="") - -bitmap(outfile, width=600, height=600, units="px") - -d.display <- d.display[order(d.display$novel_titv),] -plot(1:length(d.display$known_titv),d.display$known_titv,type="b",col="blue",ylim=c(0,4), xlab="Sample #", ylab="Ti / Tv") -points(1:length(d.display$novel_titv),d.display$novel_titv,type="b",col="red",ylim=c(0,4)) -legend("bottomright", c("known","novel"), col=c("blue","red"), pch=21) - -dev.off() diff --git a/R/analyzeRodProfile.R b/R/analyzeRodProfile.R deleted file mode 100755 index 4d57d72d3..000000000 --- a/R/analyzeRodProfile.R +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -d = read.table(args[1],head=T) -outfile = args[2] -title = args[3] - -# ----------------------------------------------------------------------------------------------- -# plot timing -# ----------------------------------------------------------------------------------------------- -pdf(outfile, height=5, width=8) -boxplot(d$walltime ~ d$operation, ylab = "Elapsed wall time in seconds [Log10 Scale]", log="y", main=title, cex.axis=0.75) -dev.off() diff --git a/R/assessCallingPerformance.R b/R/assessCallingPerformance.R deleted file mode 100644 index f364bc427..000000000 --- 
# a/R/assessCallingPerformance.R +++ /dev/null @@ -1,72 +0,0 @@
#
# Calling-performance report: compares simulated genotypes (sim.*) with the
# called ones (called.*) and plots sensitivity / specificity by depth and Q.
# Usage: Rscript assessCallingPerformance.R <calls.table> <out.pdf> <info>
library("plotrix")
args = commandArgs(TRUE);

onCMDLine = ! is.na(args[1])

file = "sim_calls.table"
info = "interactive R"
if ( onCMDLine ) {
  file = args[1]
  pdf(args[2])
  info = args[3]
}
# Read the table in both modes: previously `d` was only created on the command
# line, so the interactive default `file` was never loaded and `d` was undefined.
d <- read.table(file, header=TRUE)

d$sim.VAR <- d$sim.AC > 0
d$called.VAR <- d$called.AC > 0

QS = unique(d$sim.Q)
MODES = unique(d$sim.MODE)
NS = unique(d$called.AN / 2)
DEPTHS = unique(d$sim.DP)

# Start a new page that only carries a section title.
addSection <- function(name) {
    # par("mar", c(...)) is a query and never changed the margins; name the parameter to set it.
    par(mar = c(5, 4, 4, 2))
    frame()
    title(name, cex=2)
}

addSection(paste("Calling performance report: nSamples = ", NS, "\n info:", info))

results <- expand.grid(Q = QS, mode = MODES, nSamples = NS, depth = DEPTHS)
results$sensitivity = 0
results$specificity = 0

# Sensitivity / specificity of the calls for one (Q, mode, depth) combination.
#   raw   : data frame with sim.Q, sim.MODE, sim.DP, called.VAR and sim.VAR columns
#   Q, mode, depth : values selecting the rows of `raw` to evaluate
# Returns list(sensitivity, specificity, ct) where ct is the called-vs-simulated table.
determineRates <- function(raw, Q, mode, depth) {
    sub <- subset(raw, sim.Q == Q & sim.MODE == mode & sim.DP == depth)
    print(c(Q,mode,depth, dim(sub)))
    # Fix the level order so row/column 1 is always FALSE and 2 is always TRUE,
    # even when one of the two outcomes does not occur in this subset.
    lv <- c(FALSE, TRUE)
    ct <- table(factor(sub$called.VAR, levels = lv), factor(sub$sim.VAR, levels = lv), dnn = c("called.VAR", "sim.VAR"), useNA = "always")
    print(ct)
    sensitivity = ct[2,2] / sum(ct[,2])
    specificity = ct[1,1] / sum(ct[,1])
    list(sensitivity = sensitivity, specificity = specificity, ct = ct)
}

for ( i in seq_len(nrow(results)) ) {
    r <- results[i,]
    x <- determineRates(d, r$Q, r$mode, r$depth)
    results$sensitivity[i] <- x$sensitivity
    results$specificity[i] <- x$specificity
}

for ( depth in DEPTHS ) {
    boxplot(called.AC ~ sim.AC, data = subset(d, called.DP == depth * NS), main = paste("Depth of coverage ", depth), xlab = "Simulation AC", ylab = "Called AC", outwex=0.5, col = "cornflowerblue")
    abline(a=0,b=1,col="red",lwd=3)
}
print(results)

par(mfcol=c(2,1))
for ( Qt in QS ) {
    x <- subset(results, Q == Qt)
    print(x)
    plot(x$depth, x$sensitivity, type="b", main = paste("Q score", Qt), xlab = "Depth", ylab="Sensitivity")
    plot(x$depth, x$specificity, type="b", xlab = "Depth", ylab="Specificity")
}

par(mfcol=c(1,1))
plot(0,0, type="n", frame.plot=FALSE, ann=FALSE, axes=FALSE)
addtable2plot(-1, -1, data.frame(Q=results$Q, mode=results$mode, depth=results$depth, sensitivity=format(results$sensitivity, digits=2), specificity = format(results$specificity, digits=2)))


if ( onCMDLine ) dev.off()

# diff --git a/R/exomePreQC.R b/R/exomePreQC.R deleted file mode 100644 index ce37477a2..000000000 --- a/R/exomePreQC.R +++ /dev/null @@ -1,58 +0,0 @@
#
# Pre-calling exome QC: per-sample metric plots, highlighting the samples of
# the current batch against a reference data set.
# Usage: Rscript exomePreQC.R <input.tsv> <output.pdf>
args = commandArgs(TRUE)
onCMDLine = ! is.na(args[1])

if ( onCMDLine ) {
  reference_dataset = '/Users/mhanna/metrics.perSample.formatted.table'
  inputTSV = args[1]
  outputPDF = args[2]
} else {
  reference_dataset = '/Users/mhanna/metrics.perSample.formatted.table'
  inputTSV = 'GoT2D_exomes_batch_005.tsv'
  outputPDF = 'T2D.pdf'
}

library('ggplot2')

data <- read.table(inputTSV,header=TRUE)

complete <- read.table(reference_dataset,header=TRUE)
novel <- subset(complete,exon_intervals == "whole_exome_agilent_1.1_refseq_plus_3_boosters"&Novelty=="novel"&FunctionalClass=="all")
selected_samples <- novel$Sample %in% data$sample
novel_with_highlights <- cbind(novel,selected_samples)

if(onCMDLine) {
  fingerprint_lods = list()
  for(i in seq_len(nrow(data))) {
    # NOTE(review): this evaluates text taken from the input file as R code.
    # Only run on trusted inputs; consider parsing the numbers explicitly instead.
    fingerprint_lods[[as.character(data$sample[i])]] <- eval(parse(text=as.character(data$FINGERPRINT_LODS[i])))
  }

  fingerprint_lod_order = order(unlist(lapply(fingerprint_lods,median),use.names=FALSE))

  pdf(outputPDF)
  boxplot(fingerprint_lods[fingerprint_lod_order],las=3,main='Fingerprint LOD Scores By Sample',xlab='Sample',ylab='LOD Score Distribution',cex.axis=0.65)

  # ggplot objects are only drawn when printed; inside this block nothing is
  # auto-printed, so each plot is printed explicitly to reach the PDF.
  print(qplot(Sample,Selected_Bases_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='On+Near Bait Bases/PF Bases Aligned per Sample'))
  print(qplot(Sample,Mean_Target_Coverage,data=novel_with_highlights,color=selected_samples) + opts(title='Mean Target Coverage per Sample'))
  print(qplot(Sample,Zero_Coverage_Targets_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% of Targets with <2x Coverage per Sample'))
  print(qplot(Sample,Fold_80_Base_Penalty,data=novel_with_highlights,color=selected_samples) + opts(title='Fold 80 Base Penalty per Sample'))
  print(qplot(Sample,Target_Bases_20x_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% Target Bases Achieving >20x Coverage per Sample'))
  print(qplot(Sample,PF_Reads_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned per Sample'))
  print(qplot(Sample,PF_HQ_Error_Rate,data=novel_with_highlights,color=selected_samples) + opts(title='% HQ Bases mismatching the Reference per Sample'))
  # NOTE(review): the title says "Median" but the plotted column is Mean_Read_Length -- confirm which is intended.
  print(qplot(Sample,Mean_Read_Length,data=novel_with_highlights,color=selected_samples) + opts(title='Median Read Length per Sample'))
  print(qplot(Sample,Bad_Cycles,data=novel_with_highlights,color=selected_samples) + opts(title='# Bad Cycles per Sample'))
  print(qplot(Sample,Strand_Balance_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned to the + Strand per Sample'))
  print(qplot(Sample,Total_SNPs,data=novel_with_highlights,color=selected_samples) + opts(title='# SNPs called per Sample'))
  print(qplot(Sample,dbSNP_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% SNPs in dbSNP per Sample'))
  print(qplot(PCT_DBSNP,data=data,geom="histogram") + opts(title='% SNPs in dbSNP per Sample'))
  dev.off()
} else {
  print('Plotting command-line arguments')
  print(qplot(Sample,PF_Reads_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned per Sample'))
}

#qplot(Sample,Library_Size_HS,data=novel_with_highlights,color=selected_samples) + opts(title='Hybrid Sequencing Library Size per Sample')
#qplot(Sample,MEDIAN_INSERT_SIZE,data=novel_with_highlights,color=selected_samples) + opts(title='Median Insert Size per Sample')
#qplot(Sample,PCT_CHIMERAS,data=novel_with_highlights,color=selected_samples) + opts(title='% Chimera Read Pairs per Sample')
#qplot(Sample,PCT_ADAPTER,data=novel_with_highlights,color=selected_samples) + opts(title='% Unaligned Reads Matching an Adapter
# Sequence per Sample')
#qplot(Sample,NOVEL_SNPS,data=novel_with_highlights,color=selected_samples) + opts(title='# Novel SNPs called per Sample')
#qplot(Sample,DBSNP_TITV,data=novel_with_highlights,color=selected_samples) + opts(title='TiTv of SNPs in dbSNP per Sample')

# diff --git a/R/exomeQC.R b/R/exomeQC.R deleted file mode 100644 index eca847e16..000000000 --- a/R/exomeQC.R +++ /dev/null @@ -1,298 +0,0 @@
#
# Exome QC report built from VariantEval output.
# Usage: Rscript exomeQC.R <ProjectName> <VariantEvalRoot> <output.pdf> [preQCFile] [sample1,sample2,...]
library("gsalib", lib.loc="/Users/depristo/Desktop/broadLocal/GATK/trunk/R/")
library("ggplot2")
library("gplots")

# TODOs:
#  Assumes you have indels in your call set.  If not you will get errors
#  Create pre/post calling sections
#  Allow conditional use of the preQCFile (where it's not available)

args = commandArgs(TRUE)
onCMDLine = ! is.na(args[1])
LOAD_DATA = TRUE

# creates an array of c(sampleName1, ..., sampleNameN)
# from a single comma-separated string
parseHighlightSamples <- function(s) {
  return(unlist(strsplit(s, ",", fixed=TRUE)))
}

preQCFile = NA
if ( onCMDLine ) {
  ProjectName = args[1]
  VariantEvalRoot = args[2]
  outputPDF = args[3]
  if ( ! is.na(args[4]) ) {
    preQCFile = args[4]
  }
  if ( ! is.na(args[5]) ) {
    highlightSamples = parseHighlightSamples(args[5])
  } else {
    highlightSamples = c()
  }
} else {
  ProjectName = "InDevelopmentInR"
  preQCFile <- NA # "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005_per_sample_metrics.tsv"
  #VariantEvalRoot <- "qcTestData//ESPGO_Gabriel_NHLBI_eomi_june_2011_batch1"
  VariantEvalRoot <- "qcTestData/MC_Engle_11_Samples_06092011"
  outputPDF = "bar.pdf"
  highlightSamples = c() # parseHighlightSamples("29029,47243")
}

print("Report")
print(paste("Project          :", ProjectName))
print(paste("VariantEvalRoot  :", VariantEvalRoot))
print(paste("outputPDF        :", outputPDF))
print(paste("preQCFile        :", preQCFile))
print(paste("highlightSamples :", highlightSamples))

# Round the ratio columns of a VariantEval report and add the derived
# nIndels (= nInsertions + nDeletions) column to CountVariants.
expandVEReport <- function(d) {
  d$TiTvVariantEvaluator$tiTvRatio = round(d$TiTvVariantEvaluator$tiTvRatio,2)
  d$CountVariants$deletionInsertionRatio = round(d$CountVariants$deletionInsertionRatio,2)
  d$CountVariants$nIndels = d$CountVariants$nInsertions + d$CountVariants$nDeletions
  return(d)
}

# -------------------------------------------------------
# Utilities for displaying multiple plots per page
# -------------------------------------------------------

# Viewport (layout 2 graphs top to bottom)
# perSampleGraph is drawn in the top row and distgraph in the bottom row;
# `heights` gives the relative heights of the two rows.
distributePerSampleGraph <- function(distgraph, perSampleGraph, heights = c(2,1)) {
  Layout <- grid.layout(nrow = 2, ncol = 1, heights=heights)
  grid.newpage()
  pushViewport(viewport(layout = Layout))
  subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1)
  print(perSampleGraph, vp = subplot(1))
  print(distgraph, vp = subplot(2))
}

# Read the "<root>.summary.eval" and "<root>.byAC.eval" reports.
# PreQCMetrics is accepted for interface compatibility but is not used here.
createMetricsBySites <- function(VariantEvalRoot, PreQCMetrics) {
  # Metrics by sites:
  #  bySite -> counts of SNPs and Indels by novelty, with expectations
  #  byAC -> snps and indels (known / novel)
  r = list( bySite = expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".summary.eval", sep=""))),
            byAC = gsa.read.gatkreport(paste(VariantEvalRoot, ".byAC.eval", sep="")))
  r$byAC$CountVariants$nIndels = r$byAC$CountVariants$nInsertions + r$byAC$CountVariants$nDeletions
  r$byAC$TiTvVariantEvaluator$nSNPs = r$byAC$TiTvVariantEvaluator$nTi + r$byAC$TiTvVariantEvaluator$nTv
  r$byAC$CountVariants$AC = r$byAC$CountVariants$AlleleCount
  r$byAC$TiTvVariantEvaluator$AC = r$byAC$TiTvVariantEvaluator$AlleleCount
  return(r)
}

# Site-level summary table (one row per Novelty value).
# metricsBySample is accepted for interface compatibility but is not used here.
summaryTable <- function(metricsBySites, metricsBySample) {
  # SNP summary statistics
  merged = merge(metricsBySites$bySite$CountVariants, metricsBySites$bySite$TiTvVariantEvaluator)
  sub <- subset(merged, FunctionalClass=="all")
  raw = melt(sub, id.vars=c("Novelty"), measure.vars=c("nProcessedLoci", "nSNPs", "tiTvRatio", "nIndels", "deletionInsertionRatio"))
  table = cast(raw, Novelty ~ ...)
  # doesn't work with textplot
  colnames(table) <- c("Novelty", "Target size (bp)", "No. SNPs", "Ti/Tv", "No. Indels", "deletion/insertion ratio")
  return(table)
}

# Per-sample averages of the same metrics (one row per Novelty value).
sampleSummaryTable <- function(metricsBySample) {
  # SNP summary statistics
  # Use the argument: the body previously read the global `metricsBySamples`
  # and silently ignored whatever the caller passed in.
  raw <- melt(metricsBySample, id.vars=c("Novelty", "Sample"), measure.vars=c("nProcessedLoci", "nSNPs", "tiTvRatio", "nIndels", "deletionInsertionRatio"))
  table = cast(raw, Novelty ~ variable, mean)
  table$nSNPs <- round(table$nSNPs, 0)
  table$nIndels <- round(table$nIndels, 0)
  table$tiTvRatio <- round(table$tiTvRatio, 2)
  table$deletionInsertionRatio <- round(table$deletionInsertionRatio, 2)
  colnames(table) <- c("Novelty", "Target size (bp)", "No. SNPs", "Ti/Tv", "No. Indels", "deletion/insertion ratio")
  return(table)
}

# Stack the site-level and per-sample summaries, tagged by Metric.Type.
overallSummaryTable <- function(metricsBySites, metricsBySamples) {
  sitesSummary <- as.data.frame(summaryTable(metricsBySites, metricsBySamples))
  sitesSummary$Metric.Type <- "Sites"
  sampleSummary <- as.data.frame(sampleSummaryTable(metricsBySamples))
  sampleSummary$Metric.Type <- "Per-sample avg."
  # that last item puts the metric.type second in the list
  return(rbind(sitesSummary, sampleSummary)[, c(1,7,2,3,4,5,6)])
}

# Site-level plots: counts by allele count, Ti/Tv by allele count,
# SNP/indel ratio and SNP counts by functional class.
summaryPlots <- function(metricsBySites) {
  name = "SNP and Indel count by novelty and allele frequency"
  molten = melt(subset(metricsBySites$byAC$CountVariants, Novelty != "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(c("nSNPs", "nIndels")))
  p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable)
  p <- p + opts(title = name)
  p <- p + scale_y_log10("Number of variants")
  p <- p + geom_point(alpha=0.5, size=3)
  p <- p + geom_line(size=1)
  p <- p + facet_grid(variable ~ ., scales="free")
  p <- p + scale_x_continuous("Allele count (AC)")
  p2 <- p + scale_x_log10("Allele count (AC)")
  p2 <- p2 + opts(title = "")
  distributePerSampleGraph(p2, p, c(1,1))

  # Counts vs. Allele frequency
  name = "Variant counts by allele count"
  for ( measure in c("nSNPs", "nIndels")) {
    molten = melt(subset(metricsBySites$byAC$CountVariants, AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure))
    p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty), group=variable)
    p <- p + opts(title = paste(name, ":", measure))
    p <- p + scale_y_log10("Number of variants")
    p <- p + scale_x_log10("Allele count (AC)")
    p <- p + geom_point(alpha=0.5, size=4)
    p <- p + geom_smooth(aes(weight=value), size=1, method="lm", formula = y ~ x)
    p <- p + facet_grid(Novelty ~ ., scales="free")
    print(p)
  }

  name = "Transition / transversion ratio by allele count"
  # nSNPs > 0 => requires that we have some data here, otherwise Ti/Tv is zero from VE
  minSNPsToInclude = 0
  byACNoAll = subset(metricsBySites$byAC$TiTvVariantEvaluator, Novelty != "all" & AC > 0 & nSNPs > minSNPsToInclude)
  p <- ggplot(data=byACNoAll, aes(x=AC, y=tiTvRatio, color=Novelty))
  p <- p + scale_y_continuous("Transition / transversion ratio", limits=c(0,4))
  p <- p + opts(title = name)
  p <- p + geom_smooth(size=2)
  p <- p + geom_point(aes(size=log10(nSNPs), weight=nSNPs), alpha=0.5)
  p <- p + scale_x_continuous("Allele count (AC)")
  p2 <- p + scale_x_log10("Allele count (AC)")
  p2 <- p2 + opts(title = "")
  distributePerSampleGraph(p2, p, c(1,1))

  # SNPs to indels ratio by allele frequency
  name = "SNPs to indels ratio by allele frequency"
  metricsBySites$byAC$CountVariants$SNP.Indel.Ratio = metricsBySites$byAC$CountVariants$nSNPs / metricsBySites$byAC$CountVariants$nIndels
  metricsBySites$byAC$CountVariants$SNP.Indel.Ratio[metricsBySites$byAC$CountVariants$nIndels == 0] = NaN
  p <- ggplot(data=subset(metricsBySites$byAC$CountVariants, Novelty == "all" & nSNPs > 0), aes(x=AC, y=SNP.Indel.Ratio))
  p <- p + opts(title = name)
  p <- p + scale_y_continuous("SNP to indel ratio")
  #p <- p + scale_y_log10()
  p <- p + geom_point(alpha=0.5, aes(size=log10(nIndels)))
  p <- p + geom_smooth(size=2, aes(weight=nIndels))
  print(p)

  name = "SNP counts by functional class"
  molten = melt(subset(metricsBySites$bySite$CountVariants, Novelty != "all" & FunctionalClass != "all"), id.vars=c("Novelty", "FunctionalClass"), measure.vars=c(c("nSNPs")))
  p <- ggplot(data=molten, aes(x=FunctionalClass, y=value, fill=Novelty), group=FunctionalClass)
  p <- p + opts(title = name)
  p <- p + scale_y_log10("No. of SNPs")
  p <- p + geom_bar(position="dodge")
  print(p)
}

# Start a new page that only carries a section title.
addSection <- function(name) {
  # par("mar", c(...)) is a query and never changed the margins; name the parameter to set it.
  par(mar = c(5, 4, 4, 2))
  frame()
  title(name, cex=2)
}

# -------------------------------------------------------
# read functions
# -------------------------------------------------------

# Read "<root>.bySample.eval" and merge its tables into one per-sample data
# frame. Reads the script-level `preQCFile` and `highlightSamples` variables.
createMetricsBySamples <- function(VariantEvalRoot) {
  bySampleEval <- expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".bySample.eval", sep="")))
  r = merge(bySampleEval$TiTvVariantEvaluator, bySampleEval$CountVariants)
  r = merge(r, bySampleEval$CompOverlap)
  if ( ! is.na(preQCFile) ) {
    preQCMetrics <- read.table(preQCFile, header=TRUE)
    r = merge(r, preQCMetrics)
  }
  # order the samples by nSNPs -- it's the natural ordering.
  # Only the factor *levels* come from the Novelty == "all" rows; the values stay
  # r's own. Assigning x$Sample (a shorter vector) into r$Sample either failed or
  # was recycled onto the wrong rows.
  x = subset(r, Novelty=="all")
  r$Sample <- factor(r$Sample, levels=unique(as.character(x$Sample[order(x$nSNPs)])))

  # add highlight info
  r$highlight = r$Sample %in% highlightSamples

  #r = merge(merge(preQCMetrics, byACEval$TiTvVariantEvaluator), byACEval$CountVariants)
  return(subset(r, Sample != "all"))
}

# -------------------------------------------------------
# Per sample plots
# -------------------------------------------------------

# Per-sample plots: one distribution + per-sample page for each measure, the
# dbSNP rate by sample, and one faceted page for each novelty class.
perSamplePlots <- function(metricsBySamples) {
  metricsBySamples$highlightTextSizes = c(1,2)[metricsBySamples$highlight+1]
  sampleTextLabel <- geom_text(aes(label=Sample, size=highlightTextSizes))
  sampleTextLabelScale <- scale_size("Highlighted samples", to=c(3,5), breaks=c(1,2), labels=c("regular", "highlighted"))
  xAxis <- scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")

  measures = c("nSNPs", "tiTvRatio", "nSingletons", "nIndels", "deletionInsertionRatio")
  name = "by sample"
  for ( measure in measures ) {
    molten = melt(metricsBySamples, id.vars=c("Novelty", "Sample", "highlightTextSizes"), measure.vars=c(measure))

    # distribution
    p1 <- ggplot(data=molten, aes(x=value, group=Novelty, fill=Novelty))
    #p1 <- p1 + opts(title = paste(measure, name))
    p1 <- p1 + geom_density(alpha=0.5)
    p1 <- p1 + geom_rug(aes(y=NULL, color=Novelty, position="jitter"))
    p1 <- p1 + scale_x_continuous(measure)

    p2 <- ggplot(data=molten, aes(x=Sample, y=value, group=Novelty, color=Novelty), y=value)
    p2 <- p2 + opts(title = paste(measure, name))
    p2 <- p2 + geom_smooth(alpha=0.5, aes(group=Novelty))
    p2 <- p2 + sampleTextLabel + sampleTextLabelScale
    p2 <- p2 + facet_grid(Novelty ~ ., scales="free")
    p2 <- p2 + xAxis

    distributePerSampleGraph(p1, p2)
  }

  # known / novel ratio by sample
  # TODO -- would ideally not conflate SNPs and Indels
  d = subset(metricsBySamples, Novelty == "all" & CompRod == "dbsnp")
  title <- opts(title = "Novelty rate by sample")

  # distribution
  p1 <- ggplot(data=d, aes(x=compRate))
  p1 <- p1 + geom_density(alpha=0.5)
  p1 <- p1 + geom_rug(aes(y=NULL, position="jitter"))
  p1 <- p1 + scale_x_continuous("Percent of variants in dbSNP")

  p2 <- ggplot(data=d, aes(x=Sample, y=compRate))
  p2 <- p2 + title
  p2 <- p2 + geom_smooth(alpha=0.5, aes(group=Novelty))
  p2 <- p2 + sampleTextLabel + sampleTextLabelScale
  p2 <- p2 + geom_rug(aes(x=NULL, position="jitter"))
  p2 <- p2 + xAxis
  p2 <- p2 + scale_y_continuous("Percent of variants in dbSNP")
  distributePerSampleGraph(p1, p2)

  for ( novelty in c("all", "known", "novel") ) {
    # TODO -- how can I color it as before?
    # TODO -- add marginal distributions?
    molten = melt(subset(metricsBySamples, Novelty==novelty), id.vars=c("Sample", "highlightTextSizes"), measure.vars=measures)
    p <- ggplot(data=molten, aes(x=Sample, y=value))
    p <- p + opts(title = paste(name, ":", novelty))
#    p <- p + scale_y_log10("Number of variants")
#    p <- p + geom_point(alpha=0.5, size=4)
    p <- p + sampleTextLabel + sampleTextLabelScale
    p <- p + facet_grid(variable ~ ., scales="free")
    # how do we remove the labels?
    p <- p + xAxis
    print(p)
  }
}

# -------------------------------------------------------
# Actually invoke the above plotting functions
# -------------------------------------------------------

# load the data.
if ( onCMDLine || LOAD_DATA ) {
  metricsBySites <- createMetricsBySites(VariantEvalRoot)
  metricsBySamples <- createMetricsBySamples(VariantEvalRoot)
}

if ( ! is.na(outputPDF) ) {
  pdf(outputPDF, height=8.5, width=11)
}

# Table of overall counts and quality
# Pass the per-sample metrics explicitly; the one-argument call left the second
# parameter missing.
textplot(overallSummaryTable(metricsBySites, metricsBySamples), show.rownames=FALSE)
title(paste("Summary metrics for project", ProjectName), cex=3)
# textplot(as.data.frame(sampleSummaryTable(metricsBySamples)), show.rownames=F)
# title(paste("Summary metrics per sample for project", ProjectName), cex=3)

summaryPlots(metricsBySites)
perSamplePlots(metricsBySamples)

if ( ! is.na(outputPDF) ) {
  dev.off()
}

# diff --git a/R/generateBySamplePlot.R b/R/generateBySamplePlot.R deleted file mode 100644 index 3dc05c7ae..000000000 --- a/R/generateBySamplePlot.R +++ /dev/null @@ -1,45 +0,0 @@
#########################################################################
# this script generates a plot of sample depth of coverage over the MHC.
# It's rather specific to that use case, but is a good example of getting
# Loess curve generation to work given a X/Y dataset.
#
# 12/9/2009
# -Aaron
#########################################################################

# setup our output PNG
png(filename="bySampleJPName.png",width=1500,height=700,bg="white")

# input our data set
tbl <- read.csv("docOutJP.csv",header=TRUE) # doc_JP_SN_totalled_clean.csv

par(las=1) # make all labels horizontal
par(xpd=TRUE, mar=par()$mar+c(0,0,-2,4)) # adjust the margins to accommodate our legend

# do the initial plot of one column of data
plot(tbl[,1],tbl[,5],xlim=c(18517983,41461957),ylim=c(0,7),type="p",cex=0.2,axes=FALSE,ylab="Average Read Depth Of Coverage",xlab="MHC Location",col=rgb(0,0,0,0.1))

# add the custom x and y axis, so we can control their layout
axis(1,pos=0,at=seq(18517983,42061957,by=500000),col.axis="black")
axis(2,pos=18517983,at=seq(0,7,by=1),col="black")

# setup two color schemes, both with the same colors.
# One has an alpha of 0.08 for the background points,
# and the other is alpha=1 for the lines (which we want to be vibrant in the foreground)
myColors <- rainbow(30,alpha=0.08)
myColors2 <- rainbow(30)

# add a legend.  There is a better way to do this besides hard-coding it, but it wouldn't render correctly on my machine
# The labels are paired with myColors2 by position (entry n gets colour n).
legend(x=41000000,y=5,
       c("NA18940","NA18942","NA18943","NA18944","NA18945","NA18947",
         "NA18948","NA18949","NA18951","NA18952","NA18953","NA18956",
         "NA18959","NA18960","NA18961","NA18964","NA18965","NA18967",
         "NA18968","NA18969","NA18970","NA18971","NA18972","NA18973",
         "NA18974","NA18975","NA18976","NA18980","NA18981","NA19005"),
       horiz=FALSE,lty=c(1),col=c(myColors2),cex=0.8)

# loop over the remaining data sets, adding first the points to the graph, then calculating the loess points, and finally combining the points into a line
# the loess smoothing parts were inspired by: http://research.stowers-institute.org/efg/R/Statistics/loess.htm
# adjust the span value to adjust the sensitivity of curve to the local fit.
# Columns 4:33 of tbl are plotted, one per sample. The colour vectors have 30
# entries, so column i uses colour i - 3: indexing them with i itself shifted
# every colour by three legend entries and left columns 31:33 with NA colours.
# (Presumably the legend lists the samples in column order -- confirm against the CSV.)
for (i in 4:33) {
  colorIndex <- i - 3
  points(tbl[,1],tbl[,i],col=myColors[colorIndex],cex=0.2)
  y.loess <- loess(y ~ x, span=0.05, data.frame(x=tbl[,1], y=tbl[,i]))
  y.predict <- predict(y.loess, data.frame(x=tbl[,1]))
  lines(tbl[,1],y.predict,col=myColors2[colorIndex])
}

# close our png
dev.off()

# diff --git a/R/phasing/RBP_theoretical.R b/R/phasing/RBP_theoretical.R deleted file mode 100644 index 8d84329d9..000000000 --- a/R/phasing/RBP_theoretical.R +++ /dev/null @@ -1,323 +0,0 @@
# pOneSiteIsHom = p(top chromosome is ref AND bottom chromosome is ref) + p(top chromosome is var AND bottom chromosome is var)
# = (1-theta)^2 + theta^2
#
# pOneSiteIsHet = p(top chromosome is ref AND bottom chromosome is var) + p(top chromosome is var AND bottom chromosome is ref)
# = (1-theta)*theta + theta*(1-theta) = 2*theta*(1-theta)
#
# theta: per-chromosome probability that a site carries the variant allele
# (may be a vector). Returns the probability that a diploid site is het.
pOneSiteIsHet <- function(theta) {
  2 * theta * (1 - theta)
}

# p = 2 * theta * (1 - theta)
# and mean intra-het distance = 1/p, or d = 1/p
# or: p = 1/d
# or: 2 * theta * (1 - theta) = 1/d
# theta * (1 - theta) = 1/2d
# - theta^2 + theta - 1/2d = 0
#
# Using the quadratic equation:
# (- b + (b^2 - 4*a*c)^0.5) / 2a
# (-1 + (1 - 2/d)^0.5) / -2
#
# d: mean distance between consecutive het sites (may be a vector).
# Returns the smaller root, i.e. theta <= 0.5. For d < 2 no real solution
# exists (p would exceed its maximum of 0.5) and the result is NaN.
meanIntraHetDistanceToTheta <- function(d) {
  # Same value as (-1 + (1 - 2/d)^0.5) / -2, written with the signs cancelled.
  (1 - (1 - 2 / d)^0.5) / 2
}

# For consecutive diploid het sites x and y, P(distance(x,y) = k)
# = P(site y is the first het site downstream of x at distance = k | het site x exists at its location).
# That is, het site x already "exists", and we want to know what the probability that the NEXT het site (y) is k bases away.
#
# Evaluated as the exponential density with rate pOneSiteIsHet(theta), i.e. the
# continuous approximation of the geometric distribution.
pHetPairAtDistance <- function(k, theta) {
  pOneSiteIsHetTheta <- pOneSiteIsHet(theta)
  dexp(k, pOneSiteIsHetTheta)
}

# Since the geometric/exponential distribution is "memory-free", can simply multiply the (independent) probabilities for the distances:
pHetPairsAtDistances <- function(dists, theta) {
  prod(pHetPairAtDistance(dists, theta))
}

# Sample numDists distances from the intra-het distance distribution.
# [since the geometric/exponential distribution is "memory-free", can simply **independently** sample from the distribution]:
#
# numDists: number of distances to draw; theta: per-chromosome variant probability.
# Returns numDists whole-number distances.
sampleIntraHetDistances <- function(numDists, theta) {
  pOneSiteIsHetTheta <- pOneSiteIsHet(theta)
  ceiling(rexp(numDists, pOneSiteIsHetTheta)) # round up to get whole-number distances starting from 1
}

# For consecutive diploid het sites x and y, P(distance(x,y) <= k)
#
# k: distance(s), may be a vector; theta: per-chromosome variant probability.
# Returns a numeric vector with one probability per element of k.
pHetPairLteDistance <- function(k, theta) {
  # Although the real minimum distance starts with 1 (geometric distribution), the exponential distribution approximation starts with 0,
  # so this is the exponential CDF on [0, k] with the same rate that pHetPairAtDistance uses for its density.
  # The closed form replaces a per-element integrate() call, which is slower and can return ~0 when k is
  # far beyond the bulk of the density (the quadrature nodes then only see zeros).
  pexp(k, rate = pOneSiteIsHet(theta))
}

# Probability (over locations of x on the read) that a paired-end read ALREADY covering site x [with 2 mates of length L reading a fragment of length F] will ALSO cover site y (k bases downstream of x):
#
# If read 1 in mate spans [s1, e1] and read 2 spans [s2, e2], where length(read 1) = e1 - s1 + 1 = length(read 2) = e2 - s2 + 1 = L, then i = s2 - e1 - 1 [BY DEFINITION of i].
# i == "insert size" is DEFINED AS: F - 2 * L
#
#
# FOR i >= 0:
#
# Assume that read is equally likely to cover x at any of the 2L positions, so uniform probability of 1/2L at each of them.
# P(read r covers (x,y) | r covers x, r = [L,i,L], distance(x,y) = k)
# = sum_p=1^p=L {1/2L * 1{k <= L-p OR L-p+i+1 <= k <= 2L+i-p}} + sum_p=1^p=L {1/2L * 1{k <= L-p}}
# = 1/2L * [2 * sum_p=1^p=L {1{k <= L-p}} + sum_p=1^p=L {1{L-p+i+1 <= k <= 2L+i-p}}]
# = 1/2L * [2 * max(0, L-k) + max(0, min(L, max(0, k-i)) - max(0, k-i-L))]
#
#
# L: read (mate) length; F: fragment length; k: distance from x to y.
# All three are vectorised with the usual recycling of pmin()/pmax().
# Returns the coverage probability, a value in [0, 1] per element.
# Stops if any fragment size is missing or < 1, or if any distance is negative.
pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance <- function(L, F, k) {
  # NB: the parameter `F` shadows the FALSE shorthand here, so only TRUE/FALSE are used below.
  if (any(is.na(F)) || min(F) < 1) {
    stop("Cannot have fragments of size < 1")
  }
  # A negative distance would make the "same mate" term exceed the read length
  # and yield a probability above 1.
  if (any(k < 0, na.rm = TRUE)) {
    stop("Cannot have distances < 0")
  }

  # if F < L, then set the effective read length to be F:
  L <- pmin(L, F)

  i <- F - 2 * L
  #print(paste("pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L= (", paste(L, collapse=", "), "), F= (", paste(F, collapse=", "), "), k= (", paste(k, collapse=", "), ")), i= (", paste(i, collapse=", "), ")", sep=""))

  # If i < 0, then ASSUMING that overlapping region is identical, we can "pretend" to have 2 reads of length L and L+i, with no insert between them.
  # Otherwise, leave i alone and L1 = L2 = L:
  L1 <- L
  L2 <- L + pmin(0, i) # set effective length of second read to L+i if i < 0
  i <- pmax(0, i) # set effective insert size to be >= 0


  # Positions of x (in either mate) for which y falls within the same mate.
  pWithinSameMate <- pmax(0, L1 - k) + pmax(0, L2 - k)

  #maxValueFor_p = pmin(L1, pmax(0, k - i))
  #minValueFor_p_minusOne = pmax(0, k - i - L2)

  # Positions p of x in the first mate for which y falls in the second mate:
  # L1-p+i+1 <= k <= L1+L2+i-p, with 1 <= p <= L1.
  maxValueFor_p <- pmin(L1, L1 + L2 + i - k)
  minValueFor_p_minusOne <- pmax(0, L1 - k + i)
  pInDifferentMates <- pmax(0, maxValueFor_p - minValueFor_p_minusOne)

  (pWithinSameMate + pInDifferentMates) / (L1 + L2)
}

# Probability of having a fragment of size fragmentSize, where the fragment sizes are normally distributed with mean Fm and standard deviation Fs:
# (this is the normal density evaluated at fragmentSize)
pFragmentSize <- function(fragmentSize, Fm, Fs) {
  dnorm(fragmentSize, mean = Fm, sd = Fs)
}

# Probability (over locations of x on the read, and fragment sizes) that there could exist a paired-end read [with 2 mates of length L covering a fragment] covers both sites x and y (at distance k):
#
Integral_from_0^to_INFINITY { pFragmentSize(s, Fm, Fs) * pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, s, k) ds } -pFragmentsReadsCanCoverHetPairAtDistance <- function(L, k, Fm, Fs) { - if (Fs != 0) { - pCoverageBySpecificFragment <- function(s) {pFragmentSize(s, Fm, Fs) * pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, s, k)} - - MAX_NUM_SD = 10 - maxDistance = MAX_NUM_SD * Fs - minFragmentSize = max(1, Fm - maxDistance) # NOT meaningful to have fragment size < 1 - maxFragmentSize = Fm + maxDistance - - integrate(pCoverageBySpecificFragment, lower=minFragmentSize, upper=maxFragmentSize)$value - } - else {# All fragments are of size exactly Fm: - pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, Fm, k) - } -} - -# Probability (over locations of x on the read, fragment sizes, and read depths) that there exist at least nReadsToPhase paired-end reads covering both sites x and y (at distance k): -# = Sum_from_d=0^to_d=2*meanDepth { p(having d reads | poisson with meanDepth) * p(there at least nReadsToPhase succeed in phasing x,y | given d reads in total) } -# p(having d reads | poisson with meanDepth) = dpois(d, meanDepth) -# p(there are at least nReadsToPhase that succeed in phasing x,y | given d reads in total) = pbinom(nReadsToPhase - 1, k, pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs), lower.tail = FALSE) -pDirectlyPhaseHetPairAtDistanceUsingDepth_SINGLE_k <- function(meanDepth, nReadsToPhase, L, k, Fm, Fs) { - THRESH = 10^-8 - p = pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs) - - # deal with numerical issues: - if (abs(1 - p) < THRESH) { - p = 1 - } - else if (abs(p) < THRESH) { - p = 0 - } - - pAtLeastNreadsToPhaseGivenDepth <- function(d) pbinom(nReadsToPhase - 1, d, p, lower.tail = FALSE) - pAtLeastNreadsToPhaseAndDepth <- function(d) dpois(d, meanDepth) * pAtLeastNreadsToPhaseGivenDepth(d) - - minDepth = 0 - maxDepth = 2 * meanDepth - sum(apply(as.matrix(minDepth:maxDepth), 1, 
pAtLeastNreadsToPhaseAndDepth)) -} - -pDirectlyPhaseHetPairAtDistanceUsingDepth <- function(meanDepth, nReadsToPhase, L, k, Fm, Fs) { - Vectorize(function(dist) pDirectlyPhaseHetPairAtDistanceUsingDepth_SINGLE_k(meanDepth, nReadsToPhase, L, dist, Fm, Fs))(k) -} - -pDirectlyPhaseHetPairAndDistanceUsingDepth <- function(meanDepth, nReadsToPhase, L, k, theta, Fm, Fs) { - pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, Fm, Fs) * pHetPairAtDistance(k, theta) -} - -# Probability (over locations of x on the read, fragment sizes, read depths, and het-het distances) that that there exist at least nReadsToPhase paired-end reads covering both sites x and y (where the distance between x and y is as per the geometric/exponential distribution): -pDirectlyPhaseHetPair <- function(meanDepth, nReadsToPhase, L, theta, Fm, Fs) { - # Although the real minimum distance starts with 1 (geometric distribution), the exponential distribution approximation starts with 0: - MIN_DISTANCE = 0 - MAX_DISTANCE = Inf - - iRes = integrate(function(k) pDirectlyPhaseHetPairAndDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, theta, Fm, Fs), lower=MIN_DISTANCE, upper=MAX_DISTANCE, subdivisions=1000, stop.on.error = FALSE) - if (iRes$message != "OK") { - print(paste("DISTANCE INTEGRATION WARNING: ", iRes$message, sep="")) - } - iRes$value -} - -# Probability (over locations of sites on reads, fragment sizes, and read depths) that paired-end reads can TRANSITIVELY phase phaseIndex relative to phaseIndex - 1, given a window of length(windowDistances)+1 het sites at distances given by windowDistances (where an edge in the transitive path requires at least nReadsToPhase reads): -pPhaseHetPairAtDistanceUsingDepthAndWindow <- function(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs, MIN_PATH_PROB = 10^-6) { - n = length(windowDistances) + 1 # the window size - if (phaseIndex < 2 || phaseIndex > n) { - stop("phaseIndex < 2 || phaseIndex > n") - } - 
#print(paste("windowDistances= (", paste(windowDistances, collapse=", "), ")", sep="")) - - # A. Pre-compute the upper diagonal of square matrix of n CHOOSE 2 values of: - # pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, dist(i,j), Fm, Fs) - # - # NOTE that the probabilities of phasing different pairs are NOT truly independent, but assume this for convenience... - # - pPhasePair = matrix(data = 0, nrow = n, ncol = n) - for (i in seq(from=1, to=n-1, by=1)) { - for (j in seq(from=i+1, to=n, by=1)) { - dist = distanceBetweenPair(i, j, windowDistances) - #print(paste("distanceBetweenPair(", i, ", ", j, ", windowDistances) = ", dist, sep="")) - - pPhaseIandJ = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, dist, Fm, Fs) - pPhasePair[i, j] = pPhaseIandJ - pPhasePair[j, i] = pPhaseIandJ - } - } - #print(pPhasePair) - - # B. We need to consider ALL possible paths from phaseIndex - 1 ---> phaseIndex - # There are: sum_i=0^to_n-2 {n-2 CHOOSE i * i!} such paths. 
- # Multiply the phasing probs along the path, and sum over all such paths: - # - startNode = phaseIndex - 1 - endNode = phaseIndex - - possibleIntermediateNodes = vector() - if (startNode > 1) possibleIntermediateNodes = c(possibleIntermediateNodes, seq(from=1, to=startNode-1, by=1)) - if (endNode < n) possibleIntermediateNodes = c(possibleIntermediateNodes, seq(from=endNode+1, to=n, by=1)) - #print(paste("possibleIntermediateNodes= {", paste(possibleIntermediateNodes, collapse=", "), "}", sep="")) - - pWindowNotPhasing = 1 - library(gtools) - for (subset in powerSet(length(possibleIntermediateNodes))) { - subset = possibleIntermediateNodes[subset] - #print((paste("subset = {", paste(subset, collapse=", "), "}", sep=""))) - - if (length(subset) == 0) { - paths = c() - } - else { - paths = permutations(length(subset), length(subset), v=subset) - } - # Add on the start and the end: - paths = cbind(startNode, paths, endNode) - - for (i in 1:nrow(paths)) { - path = paths[i,] - pSpecificPathPhases = 1 - for (j in seq(from=1, to=length(path)-1, by=1)) { - pSpecificPathPhases = pSpecificPathPhases * pPhasePair[path[j], path[j+1]] - if (pSpecificPathPhases < MIN_PATH_PROB) { # Do a "bounded" calculation [any path that is ALREADY of low probability can be discarded]: - #print(paste("pSpecificPathPhases= ", pSpecificPathPhases, sep="")) - pSpecificPathPhases = 0 - break - } - } - pWindowNotPhasing = pWindowNotPhasing * (1 - pSpecificPathPhases) - - #print((paste("path = (", paste(path, collapse=", "), "), pSpecificPathPhases= ", pSpecificPathPhases, sep=""))) - } - } - - 1 - pWindowNotPhasing -} - -# distance(i,j) = distance(i,i+1) + ... 
+ distance(j-1,j), where distance(i,i+1) is given by windowDistances(i): -distanceBetweenPair <- function(i, j, windowDistances) { - if (i > j) { - tmp = i - i = j - j = tmp - } - if (i < 1 || j > length(windowDistances) + 1) { - stop(paste(i, " = i < 1 || ", j, " = j > length(windowDistances) + 1 = ", length(windowDistances) + 1, sep="")) - } - - sum(windowDistances[i:(j-1)]) -} - -# n = size of set for which power set is to be returned -powerSet <- function(n) { - library(sfsmisc) - - subsets = list() - for (i in seq(from=0, to=(2^n)-1, by=1)) { - subsets[i+1] = list(which(digitsBase(i, base = 2, ndigits = n) == 1)) - } - subsets -} - -pPhaseHetPairAndDistancesUsingDepthAndWindow <- function(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs, theta) { - p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs) * pHetPairsAtDistances(windowDistances, theta) - - #print(paste(p, " = pPhaseHetPairAndDistancesUsingDepthAndWindow(windowDistances= (", paste(windowDistances, collapse=", "), "), phaseIndex= ", phaseIndex, ", meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, ", theta= ", theta, ") * pHetPairsAtDistances(windowDistances= ", paste(windowDistances, collapse=", "), ", theta= ", theta, ")", sep="")) - - p -} - -# Probability (over locations of sites on reads, fragment sizes, and read depths) that paired-end reads can TRANSITIVELY phase phaseIndex relative to phaseIndex - 1, given a window of n het sites at distances distributed as determined by theta (where an edge in the transitive path requires at least nReadsToPhase reads): -pDirectlyPhaseHetPairUsingWindow <- function(meanDepth, nReadsToPhase, L, theta, Fm, Fs, n, phaseIndex) { - if (n < 2) { - stop("n < 2") - } - ndim = n-1 - - integrandFunction <- function(windowDistances) {pPhaseHetPairAndDistancesUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs, 
theta)} - - MIN_DISTANCE = 0 - - # - #MAX_DISTANCE = Inf - # - MAX_TAIL_PROB = 10^-6 - MAX_DISTANCE = 7500 # Only 3e-07 [= 1 - pHetPairLteDistance(7500, 10^-3)] of the het-het pairs are at a distance > 7500 - while (1 - pHetPairLteDistance(MAX_DISTANCE, theta) > MAX_TAIL_PROB) { - MAX_DISTANCE = MAX_DISTANCE * 2 - } - - lower = as.vector(matrix(data=MIN_DISTANCE, nrow=1, ncol=ndim)) - upper = as.vector(matrix(data=MAX_DISTANCE, nrow=1, ncol=ndim)) - - N = 10^4 * ndim^2 - high_dimensional_integrate(ndim, lower, upper, integrandFunction, N, DEBUG = TRUE, PRINT_EVERY = 10^2) -} - -# Use the simplest version of the Monte Carlo method to integrate over a high-dimensional function: -high_dimensional_integrate <- function(ndim, lower, upper, integrandFunction, N = 10^4, DEBUG = FALSE, PRINT_EVERY = 10^3) { - rectangularVolume = prod(upper - lower) - - sum = 0 - for (i in 1:N) { - randVals = as.vector(matrix(data = NA, nrow=1, ncol=ndim)) - for (j in 1:ndim) { - randVals[j] = runif(1, min=lower[j], max=upper[j]) - } - #print(randVals) - - evalFuncVal = integrandFunction(randVals) - sum = sum + evalFuncVal - - if (DEBUG && (i-1) %% PRINT_EVERY == 0) { - estimate = rectangularVolume * (sum / i) - print(paste("high_dimensional_integrate: iteration ", i, ", estimate= ", estimate, sep="")) - } - } - rectangularVolume * (sum / N) -} - -middleOfWindowIndex <- function(windowSize) { - floor(windowSize/2 + 1) -} diff --git a/R/phasing/calcPhasingProbsForWindowDistances.R b/R/phasing/calcPhasingProbsForWindowDistances.R deleted file mode 100644 index 42aaef568..000000000 --- a/R/phasing/calcPhasingProbsForWindowDistances.R +++ /dev/null @@ -1,47 +0,0 @@ -calcPhasingProbsForWindowDistances <- function(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME = NULL) { - WINDOW_SIZES = 2:MAX_WINDOW_SIZE - - phaseProbsPositionWindow = matrix(data = NA, nrow=length(distances), ncol=length(WINDOW_SIZES)) - - for (i in 1:length(distances)) { - # Try to phase (i+1)-st 
position [relative to i] using varying window sizes: - for (j in 1:length(WINDOW_SIZES)) { - windowSize = WINDOW_SIZES[j] - remainingSize = windowSize - 2 # exlcude i, i+1 - - numOnLeft = i - 1 - numOnRight = (length(distances) + 1) - (i + 2) + 1 - - if (numOnLeft <= numOnRight) { - halfToUse = floor(remainingSize / 2) # skimp on the left [floor], and be generous with the right side - useOnLeft = min(halfToUse, numOnLeft) - useOnRight = min(remainingSize - useOnLeft, numOnRight) - } - else { - halfToUse = ceiling(remainingSize / 2) # be generous with the right side [ceiling] - useOnRight = min(halfToUse, numOnRight) - useOnLeft = min(remainingSize - useOnRight, numOnLeft) - } - startInd = i - useOnLeft # go left from position i - stopInd = i + 1 + useOnRight # go right from position i + 1 - - usePositionRange = seq(from=startInd, to=stopInd, by=1) - useDistancesRange = seq(from=startInd, to=stopInd-1, by=1) # since there are N-1 distances between N consecutive positions - - phaseIndex = which(usePositionRange == i+1) - if (length(phaseIndex) != 1) stop("NO phaseIndex!") - windowDistances = distances[useDistancesRange] - - print(paste("Try to phase position ", i+1, " [relative to ", i, "] using positions: (", paste(usePositionRange, collapse=", "), "), windowDistances= (", paste(windowDistances, collapse=", "), "), [phaseIndex= ", phaseIndex, ", i=", i, "]", sep="")) - p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs) - print(paste("phase prob: ", p, sep="")) - phaseProbsPositionWindow[i, j] = p - } - - if (!is.null(FILE_NAME)) { - save(list = ls(all=TRUE), file = paste(FILE_NAME, ".RData", sep="")) - } - } - - list(phaseProbsPositionWindow=phaseProbsPositionWindow, WINDOW_SIZES=WINDOW_SIZES) -} diff --git a/R/phasing/scripts/play_RBP_theoretical.R b/R/phasing/scripts/play_RBP_theoretical.R deleted file mode 100644 index e261f326f..000000000 --- a/R/phasing/scripts/play_RBP_theoretical.R +++ /dev/null @@ 
-1,54 +0,0 @@ -# -#options(warn=2) -#options(error=recover) -# - -HALF = high_dimensional_integrate(1, -200, 0, dnorm) -print(paste("Should be ~ HALF: ", HALF, sep="")) - - -k = 75 -#theta = 10^-2 -theta = 10^-3 - -p = pHetPairLteDistance(k, theta) -print(paste(p, " = pHetPairLteDistance(k= ", k, ", theta= ", theta, ")", sep="")) - - -L = 76 -fragmentSize = 452 - - -p = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, fragmentSize, k) -print(paste(p, " = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L= ", L, ", fragmentSize= ", fragmentSize, ", k= ", k, ")", sep="")) - -Fm = 392 -Fs = 44 - -p = pFragmentSize(300, Fm, Fs) -print(paste(p, " = pFragmentSize(300, Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) - - -p = pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs) -print(paste(p, " = pFragmentsReadsCanCoverHetPairAtDistance(L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) - - -meanDepth = 65 -nReadsToPhase = 1 -p = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, Fm, Fs) -print(paste(p, " = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) - - -p = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, Fm, Fs) -print(paste(p, " = pDirectlyPhaseHetPair(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) - - -windowDistances = c(100, 100, 100, 100, 100) -phaseIndex = 2 -p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs) -print(paste(p, " = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances= (", paste(windowDistances, collapse=", "), "), phaseIndex= ", phaseIndex, ", meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) - - - -traceback() -warnings() diff --git 
a/R/phasing/scripts/plot.intraHetDistancesDistrib.R b/R/phasing/scripts/plot.intraHetDistancesDistrib.R deleted file mode 100644 index ffe0c698c..000000000 --- a/R/phasing/scripts/plot.intraHetDistancesDistrib.R +++ /dev/null @@ -1,13 +0,0 @@ -theta = 10^-3 -params = paste("theta= ", theta, sep="") - -MIN_DIST = 1 -MAX_DIST = 10^4 -BY_DIST = 10 -DISTANCES = seq(from=MIN_DIST, to=MAX_DIST+BY_DIST, by=BY_DIST) -freqAtLteDist = pHetPairLteDistance(DISTANCES, theta) - -scatter(DISTANCES, freqAtLteDist, "intraHetDistancesDistrib", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", main=params) - - -save(list = ls(all=TRUE), file = "intraHetDistancesDistrib.RData") diff --git a/R/phasing/scripts/plot.testDepths.R b/R/phasing/scripts/plot.testDepths.R deleted file mode 100644 index 8d747a248..000000000 --- a/R/phasing/scripts/plot.testDepths.R +++ /dev/null @@ -1,39 +0,0 @@ -theta = 10^-3 - -Fm_BASE = 392 - 2 * 101 # The mean insert size == 190 -Fs = 44 - -nReadsToPhase = 1 - -params = paste("nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs, sep="") - - - -MEAN_DEPTHS = 0:65 -NUM_DEPTHS = length(MEAN_DEPTHS) - -READ_LENGTHS = c(18, 36, 76, 101, 125, 150, 175, 200, 400, 800, 1000) -READ_LENGTHS = rev(READ_LENGTHS) -NUM_READ_LENGTHS = length(READ_LENGTHS) - -depthsX = list() -depthsY = list() -depthsLeg = vector() - -for (i in 1:NUM_READ_LENGTHS) { - pPhaseDepth = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_DEPTHS)) - Fm = Fm_BASE + 2 * READ_LENGTHS[i] - for (j in 1:NUM_DEPTHS) { - pPhaseDepth[j] = pDirectlyPhaseHetPair(MEAN_DEPTHS[j], nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs) - } - depthsX[i] = list(MEAN_DEPTHS) - depthsY[i] = list(pPhaseDepth) - depthsLeg[i] = paste("L= ", READ_LENGTHS[i], sep="") -} - -scatter(depthsX, depthsY, "testDepths", xlab="Mean depth", ylab="Phaseability", main=params, leg=depthsLeg, legPos="topleft", width=14, height=7, type="b") - - - - -save(list = ls(all=TRUE), file = 
"testDepths.RData") diff --git a/R/phasing/scripts/plot.testFragments.R b/R/phasing/scripts/plot.testFragments.R deleted file mode 100644 index 7ae39c44c..000000000 --- a/R/phasing/scripts/plot.testFragments.R +++ /dev/null @@ -1,47 +0,0 @@ -theta = 10^-3 - -L = 101 - -meanDepth = 65 -nReadsToPhase = 1 - -params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, sep="") - - - -MEAN_SIZES = seq(1,2000,20) -STD_SIZES = seq(0,200,5) - - -testFragments = matrix(nrow=length(MEAN_SIZES), ncol=length(STD_SIZES)) -for (i in 1:length(MEAN_SIZES)) { - test_mean_fragment_size = MEAN_SIZES[i] - print(paste("test_mean_fragment_size: ", test_mean_fragment_size, sep="")) - for (j in 1:length(STD_SIZES)) { - test_std_fragment_size = STD_SIZES[j] - print(paste("test_std_fragment_size: ", test_std_fragment_size, sep="")) - - testFragments[i,j] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, test_mean_fragment_size, test_std_fragment_size) - } -} - - -pdf('testFragments.pdf') - -library(gplots) -heatmap.2(testFragments, ylab = "Mean fragment size", xlab = "Standard deviation fragment size", labRow = MEAN_SIZES, labCol = STD_SIZES, Rowv = NA, Colv = NA, dendrogram = "none", scale="none", revC = FALSE, density.info="none", trace="none", main=params) - -library(scatterplot3d) -xMeans = as.vector(t(matrix(rep.int(MEAN_SIZES, length(STD_SIZES)), ncol = length(STD_SIZES)))) -yStds = rep.int(STD_SIZES, length(MEAN_SIZES)) -zPhaseRate = as.vector(t(testFragments)) -scatterplot3d(xMeans, yStds, zPhaseRate, xlab = "Mean fragment size", ylab = "Standard deviation fragment size", zlab = "Phasing rate", main=params) - -bestCombo = which.max(zPhaseRate) -print(paste("For ", params, ", BEST choice gives phaseability of ", zPhaseRate[bestCombo], " using mean fragment = ", xMeans[bestCombo], ", std. 
fragment = ", yStds[bestCombo], sep = "")) -dev.off() - - - - -save(list = ls(all=TRUE), file = "testFragments.RData") diff --git a/R/phasing/scripts/plot.testIntraHetDistances.R b/R/phasing/scripts/plot.testIntraHetDistances.R deleted file mode 100644 index cb5f857c4..000000000 --- a/R/phasing/scripts/plot.testIntraHetDistances.R +++ /dev/null @@ -1,25 +0,0 @@ -L = 101 - -Fm = 392 -Fs = 44 - -meanDepth = 65 -nReadsToPhase = 1 - -params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="") - - - -MEAN_INTRA_HET_DISTANCES = seq(from=2, to=20002, by=50) -THETAS = meanIntraHetDistanceToTheta(MEAN_INTRA_HET_DISTANCES) -NUM_THETAS = length(THETAS) - -pPhaseTheta = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_THETAS)) -for (i in 1:NUM_THETAS) { - pPhaseTheta[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, THETAS[i], Fm, Fs) -} -scatter(MEAN_INTRA_HET_DISTANCES, pPhaseTheta, "testIntraHetDistances", xlab="Mean intra-het distance", ylab="Phaseability", main=params, type="b") - - - -save(list = ls(all=TRUE), file = "testIntraHetDistances.RData") diff --git a/R/phasing/scripts/plot.testReadLengths.R b/R/phasing/scripts/plot.testReadLengths.R deleted file mode 100644 index fb3316749..000000000 --- a/R/phasing/scripts/plot.testReadLengths.R +++ /dev/null @@ -1,24 +0,0 @@ -theta = 10^-3 - -Fm_BASE = 392 - 2 * 101 # The mean insert size == 190 -Fs = 44 - -meanDepth = 65 -nReadsToPhase = 1 - -params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs, sep="") - - -READ_LENGTHS = seq(from=30, to=1000, by=10) -NUM_READ_LENGTHS = length(READ_LENGTHS) - -pPhaseReadLength = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_READ_LENGTHS)) -for (i in 1:NUM_READ_LENGTHS) { - Fm = Fm_BASE + 2 * READ_LENGTHS[i] - pPhaseReadLength[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs) -} -scatter(READ_LENGTHS, 
pPhaseReadLength, "testReadLengths", xlab="Read length", ylab="Phaseability", main=params, type="b") - - - -save(list = ls(all=TRUE), file = "testReadLengths.RData") diff --git a/R/phasing/scripts/plot.testSpecificDistances.R b/R/phasing/scripts/plot.testSpecificDistances.R deleted file mode 100644 index 8dfc0b2a6..000000000 --- a/R/phasing/scripts/plot.testSpecificDistances.R +++ /dev/null @@ -1,19 +0,0 @@ -L = 101 - -Fm = 392 -Fs = 44 - -meanDepth = 65 -nReadsToPhase = 1 - -params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="") - - -DISTANCES = 0:1000 -pPhaseHetPairAtDistWithRead = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, DISTANCES, Fm, Fs) - -scatter(DISTANCES, pPhaseHetPairAtDistWithRead, "testSpecificDistances", xlab="Intra-het distance", ylab="Phaseability", main=params) - - - -save(list = ls(all=TRUE), file = "testSpecificDistances.RData") diff --git a/R/phasing/scripts/plot.testSpecificFragments.R b/R/phasing/scripts/plot.testSpecificFragments.R deleted file mode 100644 index 099cf5b3f..000000000 --- a/R/phasing/scripts/plot.testSpecificFragments.R +++ /dev/null @@ -1,8 +0,0 @@ -L = 76 -k = 75 -params = paste("L= ", L, ", k= ", k, sep="") - -FRAGMENT_SIZES = 0:100 + 2 * L -pCoverHetPairWithRead = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, FRAGMENT_SIZES, k) - -scatter(FRAGMENT_SIZES, pCoverHetPairWithRead, "testSpecificFragments", xlab="Fragment size", ylab="Probability of covering het pair", main=params) diff --git a/R/phasing/scripts/plot.testWindows.R b/R/phasing/scripts/plot.testWindows.R deleted file mode 100644 index dd9cc1eee..000000000 --- a/R/phasing/scripts/plot.testWindows.R +++ /dev/null @@ -1,32 +0,0 @@ -theta = 10^-3 - -Fm = 392 -Fs = 44 - -L = 101 - -meanDepth = 65 -nReadsToPhase = 1 - -params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs, 
sep="") - -# -#options(warn=2) -#options(error=recover) -# - -MAX_WINDOW_SIZE = 10 - -WINDOW_SIZES = 2:MAX_WINDOW_SIZE -NUM_WINDOW_SIZES = length(WINDOW_SIZES) - -pPhaseWindow = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_WINDOW_SIZES)) -for (i in 1:NUM_WINDOW_SIZES) { - n = WINDOW_SIZES[i] - phaseIndex = middleOfWindowIndex(n) - pPhaseWindow[i] = pDirectlyPhaseHetPairUsingWindow(meanDepth, nReadsToPhase, L, theta, Fm, Fs, n, phaseIndex) - - save(list = ls(all=TRUE), file = "testWindows.RData") -} - -scatter(WINDOW_SIZES, pPhaseWindow, "testWindows", xlab="Window size", ylab="Phaseability", main=params, type="b") diff --git a/R/phasing/scripts/plot.theoretical_window.R b/R/phasing/scripts/plot.theoretical_window.R deleted file mode 100644 index f14acaf3f..000000000 --- a/R/phasing/scripts/plot.theoretical_window.R +++ /dev/null @@ -1,28 +0,0 @@ -L = 101 - -Fm = 392 -Fs = 44 - -meanDepth = 65 -nReadsToPhase = 1 - -theta = 10^-3 - -params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="") - - -MAX_NUM_DISTS = 10^4 -distances = sampleIntraHetDistances(MAX_NUM_DISTS, theta) -print(paste("Using ", MAX_NUM_DISTS, " THEORETICAL distances...", sep="")) - - -MAX_WINDOW_SIZE = 10 -FILE_NAME = "theoretical_window" - -phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME) -phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow -WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES - -phaseProbsWindow = colMeans(phaseProbsPositionWindow) - -scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on empirical distances", main=params, type="b") diff --git a/R/phasing/scripts/plot.theoretical_window_on_empirical.R b/R/phasing/scripts/plot.theoretical_window_on_empirical.R deleted file mode 100644 index b1463538b..000000000 --- 
a/R/phasing/scripts/plot.theoretical_window_on_empirical.R +++ /dev/null @@ -1,30 +0,0 @@ -L = 101 - -Fm = 392 -Fs = 44 - -meanDepth = 65 -nReadsToPhase = 1 - -params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="") - - -distances = scan("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt", what=list(dist=0)) -distances = distances$dist - -MAX_NUM_DISTS = 10^4 -NUM_DISTS_TO_USE = min(MAX_NUM_DISTS, length(distances)) -distances = distances[1:NUM_DISTS_TO_USE] -print(paste("Using ", NUM_DISTS_TO_USE, " EMPIRICAL distances...", sep="")) - - -MAX_WINDOW_SIZE = 10 -FILE_NAME = "theoretical_window_on_empirical" - -phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME) -phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow -WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES - -phaseProbsWindow = colMeans(phaseProbsPositionWindow) - -scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on empirical distances", main=params, type="b") diff --git a/R/phasing/scripts/plot_all.theoretical_and_empirical.R b/R/phasing/scripts/plot_all.theoretical_and_empirical.R deleted file mode 100644 index 31b093578..000000000 --- a/R/phasing/scripts/plot_all.theoretical_and_empirical.R +++ /dev/null @@ -1,181 +0,0 @@ -NUM_chr1_HET_SITES = as.integer(system("grep -c 'chr1:' ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list", intern=TRUE)) -NUM_chr1_PHASEABLE_HET_SITES = NUM_chr1_HET_SITES - 1 # since can't phase the first het site - - -# -#USE_EMPIRICAL_WINDOWS = c(10, 2) -# -USE_EMPIRICAL_WINDOWS = c(2) - - -TWO_COLORS = c("red", "darkgreen") - - -###################################################################### -# Phasing as a function of SPECIFIC intra-het distances: -###################################################################### 
-load("testSpecificDistances.RData") - -MAX_DISTANCE = 10^3 -PQ_PHASING_THRESH = 10.0 - -distances = list() -phaseRateDistances = list() -distancesLeg = vector() - -for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) { - n = USE_EMPIRICAL_WINDOWS[nextIndex] - n_locDistancePQReadsWindow <- scan(paste("~fromer/storage/phase.NA12878/phase_all_chr.n_", n, ".NA12878", ".locus_distance_PQ_numReads_windowSize.txt", sep=""), what=list(loci="", distance=0, PQ=0, reads=0, window=0)) - n_distance <- n_locDistancePQReadsWindow$distance - n_PQ <- n_locDistancePQReadsWindow$PQ - - distanceVector = sort(unique(n_distance)) - distanceVector = distanceVector[which(distanceVector <= MAX_DISTANCE)] - numDists = length(distanceVector) - - phasedFractionVector = as.vector(matrix(data=-1, nrow=1, ncol=numDists)) - - print(paste("numDists= ", numDists, sep="")) - print(paste(distanceVector, collapse=", ")) - - for (i in 1:numDists) { - d = distanceVector[i] - print(paste("d= ", d, sep="")) - - dInds = which(n_distance == d) - phasedFractionVector[i] = length(which(n_PQ[dInds] >= PQ_PHASING_THRESH)) / length(dInds) - } - - distances[nextIndex] = list(distanceVector) - phaseRateDistances[nextIndex] = list(phasedFractionVector) - distancesLeg[nextIndex] = paste("HiSeq (window = ", n, ")", sep="") -} - -nextIndex = nextIndex+1 -distances[nextIndex] = list(DISTANCES) -phaseRateDistances[nextIndex] = list(pPhaseHetPairAtDistWithRead) -distancesLeg[nextIndex] = "Theoretical (window = 2)" # params - -scatter(distances, phaseRateDistances, "specific_distances.theoretical_empirical", xlab="Intra-het distance", ylab="Phaseability", leg=distancesLeg, legPos="topright", width=14, height=7, type="b", col=TWO_COLORS) - - - -###################################################################### -# Phasing as a function of depth: -###################################################################### -load("testDepths.RData") - -depths = list() -phaseRateDepths = list() -depthsLeg = vector() - -for 
(nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) { - n = USE_EMPIRICAL_WINDOWS[nextIndex] - RGdocPhasedConsistentSwitch = scan(paste("~fromer/storage/downsampled_phasing.NA12878.HiSeq/RG.DoC_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(RGdoc=0, phased=0, consistentPhased=0, switch=0.0)) - depths[nextIndex] = list(RGdocPhasedConsistentSwitch$RGdoc) - phaseRateDepths[nextIndex] = list(RGdocPhasedConsistentSwitch$phased / NUM_chr1_PHASEABLE_HET_SITES) - depthsLeg[nextIndex] = paste("Down-sampled HiSeq (window = ", n, ")", sep="") -} - -nextIndex = nextIndex+1 -useLength = which(READ_LENGTHS == 101) -depths[nextIndex] = depthsX[useLength] -phaseRateDepths[nextIndex] = depthsY[useLength] -depthsLeg[nextIndex] = "Theoretical (window = 2)" # params - -scatter(depths, phaseRateDepths, "depths.theoretical_empirical", xlab="Mean depth", ylab="Phaseability", leg=depthsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS) - - - -###################################################################### -# Distribution of intra-het distances: -###################################################################### -load("intraHetDistancesDistrib.RData") - -empiricalIntraHetDistances = read.table("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt")$V1 -empiricalIntraHetDistances[which(empiricalIntraHetDistances >= MAX_DIST)] = MAX_DIST - -empiricalIntraHetDistancesHist = hist(empiricalIntraHetDistances, breaks=DISTANCES, plot=FALSE) -empiricalIntraHetDistancesCumulativeFrequencies = cumsum(empiricalIntraHetDistancesHist$counts) / length(empiricalIntraHetDistances) - -scatter(list(empiricalIntraHetDistancesHist$mids, DISTANCES), list(empiricalIntraHetDistancesCumulativeFrequencies, freqAtLteDist), "intraHetDistancesDistrib.theoretical_empirical", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", leg=c("NA12878 HiSeq", "Theoretical"), legPos="topleft", type="b", col=TWO_COLORS) - - - 
-###################################################################### -# Phasing as a function of MEAN intra-het distance: -###################################################################### -load("testIntraHetDistances.RData") - -hetDistances = list() -phaseRateHetDistances = list() -hetDistancesLeg = vector() - -for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) { - n = USE_EMPIRICAL_WINDOWS[nextIndex] - meanHetDistNumSitesPhasedConsistentSwitch = scan(paste("~fromer/storage/remove_het_sites.NA12878.HiSeq/meanHetDist_numSites_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(meanHetDist=0.0, numSites=0, phased=0, consistentPhased=0, switch=0.0)) - - hetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$meanHetDist) - phaseRateHetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$phased) - hetDistancesLeg[nextIndex] = paste("Removed hets from HiSeq (window = ", n, ")", sep="") -} - -nextIndex = nextIndex+1 -hetDistances[nextIndex] = list(MEAN_INTRA_HET_DISTANCES) -phaseRateHetDistances[nextIndex] = list(pPhaseTheta) -hetDistancesLeg[nextIndex] = "Theoretical (window = 2)" # params - -scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS) - -scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.log.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS, log="y", xlim=c(1, 20000)) - - -###################################################################### -# Phasing as a function of window size: -###################################################################### -load("theoretical_window_on_empirical.RData") - -windows = list() -phaseRateWindows = list() -windowsLeg = vector() - -NUM_HET_SITES = as.integer(system("cat 
~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | wc -l", intern=TRUE)) -NUM_CHR = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | cut -f1 -d':' | sort | uniq | wc -l", intern=TRUE)) -NUM_PHASEABLE_HET_SITES = NUM_HET_SITES - NUM_CHR # since can't phase the first het site of each chromosome - - -windowPhasedConsistent = scan(paste("~fromer/storage/phase.NA12878/window_phased_consistent.txt", sep=""), what=list(window=0, phased=0, consistentPhased=0)) -windows[1] = list(windowPhasedConsistent$window) -phaseRateWindows[1] = list(windowPhasedConsistent$phased / NUM_PHASEABLE_HET_SITES) -windowsLeg[1] = paste("HiSeq", sep="") - - -windows[2] = list(WINDOW_SIZES) -phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow))) -windowsLeg[2] = "Theoretical" # params - -scatter(windows, phaseRateWindows, "windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS) - - - -# Use numerical integration over theoretical distances distribution: -load("testWindows.RData") - -doneInds = which(pPhaseWindow != -1) - -windows[2] = list(WINDOW_SIZES[doneInds]) -phaseRateWindows[2] = list(pPhaseWindow[doneInds]) -windowsLeg[2] = "Theoretical" # params - -scatter(windows, phaseRateWindows, "theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS) - - - -# Use theoretical sampling of distances: -load("theoretical_window.RData") - -windows[2] = list(WINDOW_SIZES) -phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow))) -windowsLeg[2] = "Theoretical" # params - -scatter(windows, phaseRateWindows, "sampled_theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS) diff --git 
a/R/plot_Annotations_BinnedTruthMetrics.R b/R/plot_Annotations_BinnedTruthMetrics.R deleted file mode 100644 index 9f9ee290c..000000000 --- a/R/plot_Annotations_BinnedTruthMetrics.R +++ /dev/null @@ -1,190 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] -annotationName = args[2] -minBinCutoff = as.numeric(args[3]) -medianNumVariants = args[4] - -c <- read.table(input, header=T) - -all = c[c$numVariants>minBinCutoff & c$category=="all",] -novel = c[c$numVariants>minBinCutoff & c$category=="novel",] -dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] -truth = c[c$numVariants>minBinCutoff & c$category=="truth",] - -# -# Calculate min, max, medians -# - -d = c[c$numVariants>minBinCutoff,] -ymin = min(d$titv) -ymax = max(d$titv) -xmin = min(d$value) -xmax = max(d$value) -m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) -ma = all[all$value > m,] -mb = all[all$value < m,] -m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) -m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) -if(medianNumVariants == "true") { -vc = cumsum( all$numVariants/sum(all$numVariants) ) -m10 = all$value[ max(which(vc<=0.10)) ] -m25 = all$value[ max(which(vc<=0.25)) ] -m = all$value[ max(which(vc<=0.5)) ] -m75 = all$value[ min(which(vc>=0.75)) ] -m90 = all$value[ min(which(vc>=0.90)) ] -} - -# -# Plot TiTv ratio as a function of the annotation -# - -outfile = paste(input, ".TiTv.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -abline(v=m,lty=2,col="red") -abline(v=m75,lty=3) -abline(v=m25,lty=3) -text(m, ymin, "50", col="red", cex=0.6); -text(m75, ymin, "75", col="black", cex=0.6); -text(m25, ymin, "25", col="black", cex=0.6); -if(medianNumVariants == "true") { -abline(v=m90,lty=3) -abline(v=m10,lty=3) -text(m10, ymin, "10", 
col="black", cex=0.6); -text(m90, ymin, "90", col="black", cex=0.6); -} -points(novel$value,novel$titv,col="green",pch=20) -points(dbsnp$value,dbsnp$titv,col="blue",pch=20) -if( sum(all$truePositive==0) != length(all$truePositive) ) { -points(truth$value,truth$titv,col="magenta",pch=20) -legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) -} else { -legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) -} -dev.off() - -# -# Plot TiTv ratio as a function of the annotation, log scale on the x-axis -# - -outfile = paste(input, ".TiTv_log.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -abline(v=m,lty=2,col="red") -abline(v=m75,lty=3) -abline(v=m25,lty=3) -text(m, ymin, "50", col="red", cex=0.6); -text(m75, ymin, "75", col="black", cex=0.6); -text(m25, ymin, "25", col="black", cex=0.6); -if(medianNumVariants == "true") { -abline(v=m90,lty=3) -abline(v=m10,lty=3) -text(m10, ymin, "10", col="black", cex=0.6); -text(m90, ymin, "90", col="black", cex=0.6); -} -points(novel$value,novel$titv,col="green",pch=20) -points(dbsnp$value,dbsnp$titv,col="blue",pch=20) -if( sum(all$truePositive==0) != length(all$truePositive) ) { -points(truth$value,truth$titv,col="magenta",pch=20) -legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) -} else { -legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) -} -dev.off() - -# -# Plot dbsnp and true positive rate as a function of the annotation -# - -ymin = min(all$dbsnp) -ymax = max(all$dbsnp) -outfile = paste(input, ".truthRate.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -yLabel = "DBsnp Rate" -if( sum(all$truePositive==0) != length(all$truePositive) ) { -t = 
all[all$truePositive>0,] -yLabel = "DBsnp/True Positive Rate" -ymin = min(min(all$dbsnp),min(t$truePositive)) -ymax = max(max(all$dbsnp),max(t$truePositive)) -} -plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -abline(v=m,lty=2,col="red") -abline(v=m75,lty=3) -abline(v=m25,lty=3) -text(m, ymin, "50", col="red", cex=0.6); -text(m75, ymin, "75", col="black", cex=0.6); -text(m25, ymin, "25", col="black", cex=0.6); -if(medianNumVariants == "true") { -abline(v=m90,lty=3) -abline(v=m10,lty=3) -text(m10, ymin, "10", col="black", cex=0.6); -text(m90, ymin, "90", col="black", cex=0.6); -} -if( sum(all$truePositive==0) != length(all$truePositive) ) { -points(t$value,t$truePositive,col="magenta",pch=20); -legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) -} -dev.off() - -# -# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis -# - -outfile = paste(input, ".truthRate_log.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -yLabel = "DBsnp Rate" -if( sum(all$truePositive==0) != length(all$truePositive) ) { -yLabel = "DBsnp/Truth Rate" -} -plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -abline(v=m,lty=2,col="red") -abline(v=m75,lty=3) -abline(v=m25,lty=3) -text(m, ymin, "50", col="red", cex=0.6); -text(m75, ymin, "75", col="black", cex=0.6); -text(m25, ymin, "25", col="black", cex=0.6); -if(medianNumVariants == "true") { -abline(v=m90,lty=3) -abline(v=m10,lty=3) -text(m10, ymin, "10", col="black", cex=0.6); -text(m90, ymin, "90", col="black", cex=0.6); -} -if( sum(all$truePositive==0) != length(all$truePositive) ) { -points(t$value,t$truePositive,col="magenta",pch=20); -legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) -} -dev.off() - -# -# Plot histogram of the 
annotation's value -# - -outfile = paste(input, ".Histogram.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -dev.off() - -# -# Plot histogram of the annotation's value, log scale on x-axis -# - -outfile = paste(input, ".Histogram_log.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -dev.off() diff --git a/R/plot_ClusterReport.R b/R/plot_ClusterReport.R deleted file mode 100755 index f2288386f..000000000 --- a/R/plot_ClusterReport.R +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] -annotationName = args[2] - -data = read.table(input,sep=",",head=T) - -outfile = paste(input, ".ClusterReport.pdf", sep="") -pdf(outfile, height=7, width=8) - -maxP = max(data$knownDist, data$novelDist) - -plot(data$annotationValue, data$knownDist, ylim=c(0,maxP),type="b",col="orange",lwd=2,xlab=annotationName,ylab="fraction of SNPs") -points(data$annotationValue, data$novelDist, type="b",col="blue",lwd=2) -legend('topright', c('knowns','novels'),lwd=2,col=c("orange","blue")) -dev.off() diff --git a/R/plot_GATK_performance_log.R b/R/plot_GATK_performance_log.R deleted file mode 100644 index 1cda4c5d2..000000000 --- a/R/plot_GATK_performance_log.R +++ /dev/null @@ -1,67 +0,0 @@ -args = commandArgs(TRUE); - -RUNME = F -onCMDLine = ! 
is.na(args[1]) -DATA_FILE = args[1] -DESCRIPTION = args[2] -#OUTPUT_PDF = paste(DATA_FILE, ".pdf", sep="") - -MAX_POINTS = 100000 - -if ( onCMDLine ) { - print(paste("Reading data from", DATA_FILE)) - d = read.table(DATA_FILE, header=T) -} - -#if ( onCMDLine ) pdf(OUTPUT_PDF) - -vec.margin <- function(x) { - l = length(x) - d = x[-1] - x[1:(l-1)] - c(x[1], d[1:(l-1)]) -} - -everyNth <- function(x, n) { - l = dim(x)[1] - m = ceiling(l / n) - print(m) - keep = 1:l %% m == 0 - x[keep,] -} - -l = length(d$units.processed) -d$units.processed.margin = vec.margin(d$units.processed) -#prev = 0 -#for ( i in 1:l ) { -# cur = d$units.processed[i] -# d[i,]$units.processed.margin = cur - prev -# prev = cur -#} - -generateOneReport <- function(d) { - qs = quantile(d$processing.speed, probs = c(0.01, 0.5, 0.99)) - - # unit processing time - if ( onCMDLine ) png(paste(DATA_FILE, ".speed.png", sep=""), width=1080, height=1080) - dpoints = everyNth(d, MAX_POINTS) - plot(dpoints$elapsed.time, dpoints$processing.speed, main=DESCRIPTION, xlab="Elapsed time (sec)", ylab="Processing speed (seconds per 1M units)", ylim=c(qs[1], qs[3]), type="b", col="cornflowerblue", lwd=2) - abline(h=qs[2], lty=2) - if ( onCMDLine ) dev.off() - - # instantaneous processing speed - if ( onCMDLine ) png(paste(DATA_FILE, ".marginal.png", sep=""), width=1080, height=1080) - running_median_window = 101 - rm = runmed(d$units.processed.margin, running_median_window) - POINT_COL = "#0000AA99" - plot(dpoints$elapsed.time, dpoints$units.processed.margin, main=DESCRIPTION, xlab="Elapsed time (sec)", ylab="Units processed in last timing interval", type="p", cex = 0.75, col=POINT_COL) - lines(d$elapsed.time, rm, lwd=3, col="red") - legend("topleft", c("Observations", "101-elt running median"), fill=c(POINT_COL, "red")) - if ( onCMDLine ) dev.off() -} - -if ( RUNME ) { - generateOneReport(d) -} - - - diff --git a/R/plot_OptimizationCurve.R b/R/plot_OptimizationCurve.R deleted file mode 100755 index 
5eff8a34c..000000000 --- a/R/plot_OptimizationCurve.R +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] -targetTITV = as.numeric(args[2]) - -# ----------------------------------------------------------------------------------------------- -# optimization curve -# ----------------------------------------------------------------------------------------------- -data = read.table(input,sep=",",head=T) -maxVars = max(data$numKnown, data$numNovel) -maxTITV = max(data$knownTITV[is.finite(data$knownTITV) & data$numKnown>2000], data$novelTITV[is.finite(data$novelTITV) & data$numNovel > 2000], targetTITV) -maxTITV = min(maxTITV, targetTITV + 1) -minTITV = min(data$knownTITV[length(data$knownTITV)], data$novelTITV[length(data$novelTITV)], targetTITV) -maxPCut = max(data$pCut[data$numKnown>0 | data$numNovel>0]) - -outfile = paste(input, ".optimizationCurve.pdf", sep="") -pdf(outfile, height=7, width=8) - -par(mar=c(4,4,1,4),cex=1.3) -plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with QUAL >= X",ylab="",ylim=c(minTITV,maxTITV),xlim=c(0,maxPCut),col="Blue",pch=20) -points(data$pCut, data$novelTITV,,col="DarkBlue",pch=20) -abline(h=targetTITV,lty=3,col="Blue") -axis(side=2,col="DarkBlue") -axis(side=1) -mtext("Ti/Tv Ratio", side=2, line=2, col="blue",cex=1.4) -legend("left", c("Known Ti/Tv","Novel Ti/Tv"), col=c("Blue","DarkBlue"), pch=c(20,20),cex=0.7) -par(new=T) -plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),xlim=c(0,maxPCut),col="Green",pch=20) -points(data$pCut, data$numNovel,col="DarkGreen",pch=20) -axis(side=4,col="DarkGreen") -mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4) -legend("topright", c("Known","Novel"), col=c("Green","DarkGreen"), pch=c(20,20),cex=0.7) -dev.off() diff --git a/R/plot_Tranches.R b/R/plot_Tranches.R deleted file mode 100755 index a79ddd3ab..000000000 --- a/R/plot_Tranches.R +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/env Rscript - 
args <- commandArgs(TRUE)
verbose <- TRUE

# Positional command-line arguments:
#   1: tranches file (comma-separated, with a header row)
#   2: target Ti/Tv ratio
#   3: target truth sensitivity
#   4: optional; suppressLegend is TRUE whenever a fourth argument is given
tranchesFile <- args[1]
targetTITV <- as.numeric(args[2])
targetSensitivity <- as.numeric(args[3])
suppressLegend <- !is.na(args[4])

# -----------------------------------------------------------------------------------------------
# Useful general routines
# -----------------------------------------------------------------------------------------------

MIN_FP_RATE <- 0.001 # 1 / 1000 is min error rate

# Estimate a false-positive rate from a single observed Ti/Tv ratio.
#
# titvExpected: the Ti/Tv ratio expected for true variants.
# titvObserved: the Ti/Tv ratio actually observed (scalar).
# Returns 1 - (observed - 0.5) / (expected - 0.5), capped above at 1 and
# below at MIN_FP_RATE.
titvFPEst <- function(titvExpected, titvObserved) {
  max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE)
}

# Vectorised form of titvFPEst(): one estimate per element of `titvs`.
# vapply() guarantees a numeric vector, including numeric(0) for empty input
# (sapply() would return an empty list there and break later arithmetic).
titvFPEstV <- function(titvExpected, titvs) {
  vapply(titvs, function(x) titvFPEst(titvExpected, x), numeric(1))
}

# Split a variant count into true and false positives.
#
# nVariants: number of variants.
# FDR:       false discovery rate expressed in percent (it is divided by 100).
# Returns a list with elements TP and FP.
nTPFP <- function(nVariants, FDR) {
  list(TP = nVariants * (1 - FDR / 100), FP = nVariants * (FDR / 100))
}

# Shift a vector one position to the left.
#
# Element i of the result is x[i + 1]; the last element is `leftValue`.
# Empty input yields an empty vector and a one-element input yields
# `leftValue`. The previous loop over 1:(length(x) - 1) ran backwards for
# those lengths and produced NA instead.
leftShift <- function(x, leftValue = 0) {
  n <- length(x)
  r <- rep(leftValue, n)
  if (n > 1) {
    r[seq_len(n - 1)] <- x[-1]
  }
  r
}

# -----------------------------------------------------------------------------------------------
# Tranches plot
# -----------------------------------------------------------------------------------------------
data2 <- read.table(tranchesFile, sep = ",", header = TRUE)

# Order the tranches by novel Ti/Tv. The column may be spelled novelTiTv or
# novelTITV (both spellings are read below), so the sort key must accept
# either one; ordering on a missing column would silently drop every row.
sortKey <- data2$novelTiTv
if (is.null(sortKey)) {
  sortKey <- data2$novelTITV
}
if (is.null(sortKey)) {
  stop("Tranches file has neither a novelTiTv nor a novelTITV column: ",
       tranchesFile, call. = FALSE)
}
data2 <- data2[order(sortKey, decreasing = FALSE), ]
#data2 = data2[order(data2$FDRtranche, decreasing=T),]

cols <- c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
density <- c(20, -1, -1, 20)
outfile <- paste0(tranchesFile, ".pdf")
pdf(outfile, height = 5, width = 8)
par(mar = c(5, 5, 4, 2) + 0.1)

# Whichever of the two column spellings is present supplies the values.
novelTiTv <- c(data2$novelTITV, data2$novelTiTv)
alpha <- 1 - titvFPEstV(targetTITV, novelTiTv)
#print(alpha)

numGood <- round(alpha * data2$numNovel)

#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
numBad <- data2$numNovel - numGood

# "Prev" values are the counts of the following row (see leftShift), and
# "New" values are the difference between a row and that neighbour.
numPrevGood <- leftShift(numGood, 0)
numNewGood <- numGood - numPrevGood
numPrevBad <- leftShift(numBad, 0)
numNewBad <- numBad - numPrevBad
-d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE) -#print(d) -barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000)) -#abline(v= d[2,dim(d)[2]], lty=2) -#abline(v= d[1,3], lty=2) -if ( ! suppressLegend ) - legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25) - -mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) -mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) -axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0) -axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0) - -# plot sensitivity vs. specificity -sensitivity = data2$truthSensitivity -if ( ! 
is.null(sensitivity) ) { - #specificity = titvFPEstV(targetTITV, novelTiTv) - specificity = novelTiTv - plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)") - abline(h=targetTITV, lty=2) - abline(v=targetSensitivity, lty=2) - #text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2) -} - -dev.off() diff --git a/R/plot_indelQuality.R b/R/plot_indelQuality.R deleted file mode 100644 index c7cf60e31..000000000 --- a/R/plot_indelQuality.R +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] -covariateName = args[2] - -outfile = paste(input, ".indelQual_v_", covariateName, ".pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -c <- read.table(input, header=T) -c <- c[sort.list(c[,1]),] - -# -# Plot qual as a function of the covariate -# - -d.good <- c[c$nBases >= 1000,] -d.1000 <- c[c$nBases < 1000,] -rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh -rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) ) -theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) -if( length(d.good$nBases) == length(c$nBases) ) { - theTitle = paste("RMSE =", round(rmseAll,digits=3)) -} -# Don't let residual error go off the edge of the plot -d.good$residualError = d.good$Qempirical#-d.good$Qreported -#d.good$residualError[which(d.good$residualError > 10)] = 10 -#d.good$residualError[which(d.good$residualError < -10)] = -10 -d.1000$residualError = d.1000$Qempirical#-d.1000$Qreported -#d.1000$residualError[which(d.1000$residualError > 10)] = 10 -#d.1000$residualError[which(d.1000$residualError < -10)] = -10 -c$residualError = c$Qempirical -#c$residualError[which(c$residualError > 10)] = 10 
#c$residualError[which(c$residualError < -10)] = -10

# Connected points ("o") when there are at most 20 covariate values,
# bare points ("p") otherwise.
pointType <- if (length(c$Covariate) <= 20) "o" else "p"

if (!is.numeric(c$Covariate)) {
  # Dinuc (and other non-numeric covariates) are different to make their plots look nice
  plot(c$Covariate, c$residualError,
       type = "l", main = theTitle,
       ylab = "Empirical Indel Quality", xlab = covariateName,
       col = "blue", ylim = c(0, 50))
  points(d.1000$Covariate, d.1000$residualError, type = "l", col = "cornflowerblue")
} else {
  # Numeric covariate: well-supported bins first, low-count bins overlaid.
  plot(d.good$Covariate, d.good$residualError,
       type = pointType, main = theTitle,
       ylab = "Empirical Indel Quality", xlab = covariateName,
       col = "blue", pch = 20,
       ylim = c(0, 50), xlim = range(c$Covariate))
  points(d.1000$Covariate, d.1000$residualError,
         type = pointType, col = "cornflowerblue", pch = 20)
}
dev.off()


#
# Plot mean quality versus the covariate
#

outfile <- paste0(input, ".reported_qual_v_", covariateName, ".pdf")
pdf(outfile, height = 7, width = 7)
par(cex = 1.1)
pointType <- if (length(c$Covariate) <= 20) "o" else "p"
theTitle <- paste("Quality By", covariateName)

if (!is.numeric(c$Covariate)) {
  # Dinuc (and other non-numeric covariates) are different to make their plots look nice
  plot(c$Covariate, c$Qreported,
       type = "l", main = theTitle,
       ylab = "Mean Reported Quality", xlab = covariateName,
       col = "blue", ylim = c(0, 40))
  points(d.1000$Covariate, d.1000$Qreported, type = "l", col = "cornflowerblue")
} else {
  plot(d.good$Covariate, d.good$Qreported,
       type = pointType, main = theTitle,
       ylab = "Mean Reported Quality", xlab = covariateName,
       col = "blue", pch = 20,
       ylim = c(0, 40), xlim = range(c$Covariate))
  points(d.1000$Covariate, d.1000$Qreported,
         type = pointType, col = "cornflowerblue", pch = 20)
}
dev.off()

#
# Plot histogram of the covariate
#

e <- d.good
f <- d.1000
outfile <- paste0(input, ".", covariateName, "_hist.pdf")
# The histogram device opened here is still open when this section ends.
pdf(outfile, height = 7, width = 7)
# data.frame() names the columns e.Covariate / e.nBases; keep only the rows
# whose base count is non-zero.
hst <- subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
-hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0) - -lwdSize=2 -if( length(c$Covariate) <= 20 ) { - lwdSize=7 -} else if( length(c$Covariate) <= 70 ) { - lwdSize=4 -} - -if( is.numeric(c$Covariate) ) { - if( length(hst$e.Covariate) == 0 ) { - plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) - } else { - plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) - points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue") - } - axis(2,axTicks(2), format(axTicks(2), scientific=F)) -} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice - hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0) - plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n") - if( length(hst$c.Covariate) > 9 ) { - axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)]) - } else { - axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate) - } - axis(2,axTicks(2), format(axTicks(2), scientific=F)) -} -dev.off() diff --git a/R/plot_residualError_OtherCovariate.R b/R/plot_residualError_OtherCovariate.R deleted file mode 100644 index a1385ff3f..000000000 --- a/R/plot_residualError_OtherCovariate.R +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] -covariateName = args[2] - -outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -c <- 
read.table(input, header=T) -c <- c[sort.list(c[,1]),] - -# -# Plot residual error as a function of the covariate -# - -d.good <- c[c$nBases >= 1000,] -d.1000 <- c[c$nBases < 1000,] -rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh -rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) ) -theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) -if( length(d.good$nBases) == length(c$nBases) ) { - theTitle = paste("RMSE =", round(rmseAll,digits=3)) -} -# Don't let residual error go off the edge of the plot -d.good$residualError = d.good$Qempirical-d.good$Qreported -d.good$residualError[which(d.good$residualError > 10)] = 10 -d.good$residualError[which(d.good$residualError < -10)] = -10 -d.1000$residualError = d.1000$Qempirical-d.1000$Qreported -d.1000$residualError[which(d.1000$residualError > 10)] = 10 -d.1000$residualError[which(d.1000$residualError < -10)] = -10 -c$residualError = c$Qempirical-c$Qreported -c$residualError[which(c$residualError > 10)] = 10 -c$residualError[which(c$residualError < -10)] = -10 -pointType = "p" -if( length(c$Covariate) <= 20 ) { - pointType = "o" -} -if( is.numeric(c$Covariate) ) { - plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate))) - points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20) -} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice - plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10)) - points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue") -} -dev.off() - - -# 
-# Plot mean quality versus the covariate -# - -outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -pointType = "p" -if( length(c$Covariate) <= 20 ) { - pointType = "o" -} -theTitle = paste("Quality By", covariateName); -if( is.numeric(c$Covariate) ) { - plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate))) - points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20) -} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice - plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40)) - points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue") -} -dev.off() - -# -# Plot histogram of the covariate -# - -e = d.good -f = d.1000 -outfile = paste(input, ".", covariateName,"_hist.pdf", sep="") -pdf(outfile, height=7, width=7) -hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0) -hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0) - -lwdSize=2 -if( length(c$Covariate) <= 20 ) { - lwdSize=7 -} else if( length(c$Covariate) <= 70 ) { - lwdSize=4 -} - -if( is.numeric(c$Covariate) ) { - if( length(hst$e.Covariate) == 0 ) { - plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) - } else { - plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) - points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue") - } - 
axis(2,axTicks(2), format(axTicks(2), scientific=F)) -} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice - hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0) - plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n") - if( length(hst$c.Covariate) > 9 ) { - axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)]) - } else { - axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate) - } - axis(2,axTicks(2), format(axTicks(2), scientific=F)) -} -dev.off() diff --git a/R/plot_residualError_QualityScoreCovariate.R b/R/plot_residualError_QualityScoreCovariate.R deleted file mode 100644 index 81bc9460d..000000000 --- a/R/plot_residualError_QualityScoreCovariate.R +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) - -input = args[1] -Qcutoff = as.numeric(args[2]) -maxQ = as.numeric(args[3]) -maxHist = as.numeric(args[4]) - -t=read.table(input, header=T) - -# -# Plot of reported quality versus empirical quality -# - -outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") -pdf(outfile, height=7, width=7) -d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,] -d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,] -d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,] -f <- t[t$Qreported < Qcutoff,] -e <- rbind(d.good, d.1000, d.10000) -rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh -rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) ) -theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) -if( length(t$nBases) - length(f$nBases) == 
length(d.good$nBases) ) { - theTitle = paste("RMSE =", round(rmseAll,digits=3)); -} -plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score") -points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) -points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16) -points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16) -abline(0,1, lty=2) -dev.off() - -# -# Plot Q empirical histogram -# - -outfile = paste(input, ".quality_emp_hist.pdf", sep="") -pdf(outfile, height=7, width=7) -hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0) -hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0) -percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases)) -entropy = -sum(log2(percentBases)*percentBases) -yMax = max(hst$e.nBases) -if(maxHist != 0) { -yMax = maxHist -} -plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n") -points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1") -axis(2,axTicks(2), format(axTicks(2), scientific=F)) -dev.off() - -# -# Plot Q reported histogram -# - -outfile = paste(input, ".quality_rep_hist.pdf", sep="") -pdf(outfile, height=7, width=7) -hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0) -hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0) -yMax = max(hst$e.nBases) -if(maxHist != 0) { -yMax = maxHist -} -plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n") -points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1") -axis(2,axTicks(2), 
format(axTicks(2), scientific=F)) -dev.off() diff --git a/R/plot_variantROCCurve.R b/R/plot_variantROCCurve.R deleted file mode 100755 index 3469e752e..000000000 --- a/R/plot_variantROCCurve.R +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] - -data = read.table(input,sep=",",head=T) -numCurves = (length(data) - 1)/3 -maxSpec = max(data[,(1:numCurves)*3]) - -outfile = paste(input, ".variantROCCurve.pdf", sep="") -pdf(outfile, height=7, width=7) - -par(cex=1.3) -plot(data$specificity1,data$sensitivity1, type="n", xlim=c(0,maxSpec),ylim=c(0,1),xlab="1 - Specificity",ylab="Sensitivity") -for(iii in 1:numCurves) { - points(data[,iii*3],data[,(iii-1)*3+2],lwd=3,type="l",col=iii) -} -legend("bottomright", names(data)[(0:(numCurves-1))*3+1], col=1:numCurves,lwd=3) -dev.off() diff --git a/R/plotting_library.R b/R/plotting_library.R deleted file mode 100644 index 45a16d10a..000000000 --- a/R/plotting_library.R +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) -fileToRead <- args[1] -functionToRun <- args[2] -functionSpecificArgs <- args[3] - -## load the function to run - -if ( funtionToRun == "PlotInterleavedRows" ) { -### PLOT INTERLEAVED ROWS FUNCTION ### -# - expects a file of the form -# -# sample_a \t 0.8 \t 0.6 \t 0.5 -# sample_a \t 0 \t 1 \t 3 -# sample_b \t 0.5 \t 0.3 \t 0.1 -# sample_b \t 1 \t 2 \t 4 -# -# and an argument string -# x_label;y_label;plot_title;base_name_for_pdf -# - end of info - -### PLOT INTERLEAVED ROWS FUNCTION ### -PlotInterleavedRows <- function(inFile,args) { - arglist = unlist(strsplit(args,";")) - xlabel = arglist[1] - ylabel = arglist[2] - title = arglist[3] - outFileBase = arglist[4] - - allPoints <- as.matrix(read.table(inFile)) - # set up colors - colors = rainbow(ncol(allPoints)-1,s=0.8,v=0.8,gamma=0.6,start=0.0,end=0.9) - styles = c(rep(1,ncol(allPoints)-1)) - evalPoints = matrix(nrow=nrow(allPoints)/2,ncol=ncol(allPoints)) - funcVal = 
matrix(nrow=nrow(allPoints)/2,ncol=ncol(allPoints)) - # convert to two matrices by de-interleaving and transposing - for ( i in 1:(nrow(allPoints)/2) ) { - evalPoints[i,] <- allPoints[2*i,] - funcVal[i,] <- allPoints[2*i-1,] - } - - evalPoints <- t(evalPoints) - funcVal <- t(funcVal) - # plot and put legend on - pdf(paste(outFileBase,"_rplot",".pdf",sep="")) - matplot(evalPoints,funcVal,col=colors,lty=styles,"l",xlab=xlabel,ylab=ylabel) - legend("topright",funcVal[1,],lty=styles,col=colors) - title(main=title,outer=TRUE) - # save - dev.off() -} - -PlotInterleavedRows(fileToRead,functionSpecificArgs) - -} - -if ( functionToRun == "PlotHeatmap" ) { -### PLOT HEATMAP FUNCTION ### -# -# Normally what is meant by "heatmap" is just an image() of the -# matrix; in accordance with that, THIS FUNCTION DOES NOT COMPUTE -# DENDROGRAMS THROUGH HEATMAP(), so no rows and columns are not -# re-ordered, and dendrograms are not displayed. -# -# - expects a file of the form -# -# rentry1 \t rentry2 \t rentry3 \t ... -# colentry1 \t 0.7 \t 0.9 \t 0.4 \t ... -# colentry2 \t 0.8 \t 0.7 \t 0.6 \t ... -# ... -# Note that the rows and columns don't line up. R understands this -# and deals with it. -# Also expects an argument string: -# row_label;column_label;plot_title;base_name_for_pdf -# - end of info - -### PLOT HEATMAP FUNCTION ### -PlotHeatmap <- function(inFile,args) { - arglist = unlist(strsplit(args,split=";")) - row_label = arglist[1] - column_label = arglist[2] - data_rescale_factor <- as.numeric(arglist[3]) - plot_title = arglist[4] - base_name_for_pdf = arglist[5] - image_matrix <- as.matrix(read.table(inFile)) - ## change default colors to include "cool" colors for lower end of spectrum - ## e.g. 
red ~ near 1, yellow ~ near .75, green ~ near .5, teal ~ near .25 - ## blue ~ near 0 - colors <- rev(rainbow(32,start=0,end=0.6,s=0.9,v=0.9,gamma=0.8)) - pdf(paste(base_name_for_pdf,"_rplot",".pdf",sep="")) - heatmap(image_matrix,Rowv=NA,Colv=NA,ylab=row_label,xlab=column_label,col=colors) - title(main=plot_title,outer=TRUE) - dev.off() -} - -PlotHeatmap(fileToRead,functionSpecificArgs) - -} diff --git a/R/privateMutations.R b/R/privateMutations.R deleted file mode 100644 index 2fe1f8ab3..000000000 --- a/R/privateMutations.R +++ /dev/null @@ -1,61 +0,0 @@ -MAX_AC = 10000 -normHist <- function(d, m) { - x = hist(d$true.ac, breaks=1:20000, plot=F)$counts[1:MAX_AC] - x / sum(x) -} - -f <- function(d, acs) { - cols = rainbow(length(acs), alpha=0.75) - y = normHist(subset(afs, small.ac == acs[1])) - x = 1:length(y) / max(d$true.an) - plot(x, y, type="l", col=cols[1], xlab="True MAF in full population", ylab="Frequency", lwd=3, log="x") - for (i in 2:length(acs)) { - points(x, normHist(subset(afs, small.ac == acs[i])), type="l", col=cols[i], lwd=3) - } - - legend("topright", legend=lapply(acs, function(x) paste("AC =", x)), fill=cols, title="Sub-population") -} - -expected <- function(maxAN, N, eps, ac1scale = F) { - scale = 10 - - f <- function(ps, N) { - co = 2 * N / ( 1 - eps ) - co * ((1 - ps)/(1-eps))^(2 * N - 1) - } - - # these are the points that we'll actually show, but we need to do the calculation - # special for the AC = 1 given the equation actually fits an infinite population - # not a discrete population with max chromosomes - ps = 1:maxAN / maxAN - v = f(ps, N) - v = v / sum(v) - - if ( ac1scale ) { - subps = seq(1, maxAN*scale) / (maxAN * scale) - #print(subps) - subv = f(subps, N) - #print(subv) - #print(v[1:10]) - pBelowAC1 = sum(subv[1:scale] / sum(subv)) - #print(list(pBelowAC1=pBelowAC1, v1=v[1])) - v[1] = v[1] + pBelowAC1 - } - - list(ps = ps, pr = v) -} - -f(afs, c(1,2,3,5,10,50)) - -if ( F ) { -scale = 100 -ex1 = expected(200000, 1000, 1e-8) -ex2 
= expected(200000*scale, 1000, 1e-8) -i = 1:(200000*scale) %% scale == 1 -plot(ex2$ps[i], cumsum(ex1$pr), type="l",lty=3,lwd=3, log="x", col="red") -points(ex2$ps[i], cumsum(ex2$pr)[i], type="l",lty=3,lwd=3, log="x") -} - -ex = expected(200000, 1000, 1e-8, T) -points(ex$ps, ex$pr, type="l",lty=3,lwd=3) - diff --git a/R/src/gsalib/DESCRIPTION b/R/src/gsalib/DESCRIPTION deleted file mode 100644 index 6116e8c66..000000000 --- a/R/src/gsalib/DESCRIPTION +++ /dev/null @@ -1,10 +0,0 @@ -Package: gsalib -Type: Package -Title: Utility functions -Version: 1.0 -Date: 2010-10-02 -Author: Kiran Garimella -Maintainer: Kiran Garimella -Description: Utility functions for GATK NGS analyses -License: BSD -LazyLoad: yes diff --git a/R/src/gsalib/R/gsa.error.R b/R/src/gsalib/R/gsa.error.R deleted file mode 100644 index 1c6a56046..000000000 --- a/R/src/gsalib/R/gsa.error.R +++ /dev/null @@ -1,12 +0,0 @@ -gsa.error <- function(message) { - message(""); - gsa.message("Error: **********"); - gsa.message(sprintf("Error: %s", message)); - gsa.message("Error: **********"); - message(""); - - traceback(); - - message(""); - stop(message, call. 
= FALSE); -} diff --git a/R/src/gsalib/R/gsa.getargs.R b/R/src/gsalib/R/gsa.getargs.R deleted file mode 100644 index 94613bf93..000000000 --- a/R/src/gsalib/R/gsa.getargs.R +++ /dev/null @@ -1,116 +0,0 @@ -.gsa.getargs.usage <- function(argspec, doc) { - cargs = commandArgs(); - - usage = "Usage:"; - - fileIndex = grep("--file=", cargs); - if (length(fileIndex) > 0) { - progname = gsub("--file=", "", cargs[fileIndex[1]]); - - usage = sprintf("Usage: Rscript %s [arguments]", progname); - - if (!is.na(doc)) { - message(sprintf("%s: %s\n", progname, doc)); - } - } - - message(usage); - - for (argname in names(argspec)) { - key = argname; - defaultValue = 0; - doc = ""; - - if (is.list(argspec[[argname]])) { - defaultValue = argspec[[argname]]$value; - doc = argspec[[argname]]$doc; - } - - message(sprintf(" -%-10s\t[default: %s]\t%s", key, defaultValue, doc)); - } - - message(""); - - stop(call. = FALSE); -} - -gsa.getargs <- function(argspec, doc = NA) { - argsenv = new.env(); - - for (argname in names(argspec)) { - value = 0; - if (is.list(argspec[[argname]])) { - value = argspec[[argname]]$value; - } else { - value = argspec[[argname]]; - } - - assign(argname, value, envir=argsenv); - } - - if (interactive()) { - for (argname in names(argspec)) { - value = get(argname, envir=argsenv); - - if (is.na(value) | is.null(value)) { - if (exists("cmdargs")) { - assign(argname, cmdargs[[argname]], envir=argsenv); - } else { - assign(argname, readline(sprintf("Please enter a value for '%s': ", argname)), envir=argsenv); - } - } else { - assign(argname, value, envir=argsenv); - } - } - } else { - cargs = commandArgs(TRUE); - - if (length(cargs) == 0) { - .gsa.getargs.usage(argspec, doc); - } - - for (i in 1:length(cargs)) { - if (length(grep("^-", cargs[i], ignore.case=TRUE)) > 0) { - key = gsub("-", "", cargs[i]); - value = cargs[i+1]; - - if (key == "h" | key == "help") { - .gsa.getargs.usage(argspec, doc); - } - - if (length(grep("^[\\d\\.e\\+\\-]+$", value, perl=TRUE, 
ignore.case=TRUE)) > 0) { - value = as.numeric(value); - } - - assign(key, value, envir=argsenv); - } - } - } - - args = as.list(argsenv); - - isMissingArgs = 0; - missingArgs = c(); - - for (arg in names(argspec)) { - if (is.na(args[[arg]]) | is.null(args[[arg]])) { - gsa.warn(sprintf("Value for required argument '-%s' was not specified", arg)); - - isMissingArgs = 1; - missingArgs = c(missingArgs, arg); - } - } - - if (isMissingArgs) { - gsa.error( - paste( - "Missing required arguments: -", - paste(missingArgs, collapse=" -"), - ". Specify -h or -help to this script for a list of available arguments.", - sep="" - ) - ); - } - - args; -} diff --git a/R/src/gsalib/R/gsa.message.R b/R/src/gsalib/R/gsa.message.R deleted file mode 100644 index a2b909d3d..000000000 --- a/R/src/gsalib/R/gsa.message.R +++ /dev/null @@ -1,3 +0,0 @@ -gsa.message <- function(message) { - message(sprintf("[gsalib] %s", message)); -} diff --git a/R/src/gsalib/R/gsa.plot.venn.R b/R/src/gsalib/R/gsa.plot.venn.R deleted file mode 100644 index b1353ccc1..000000000 --- a/R/src/gsalib/R/gsa.plot.venn.R +++ /dev/null @@ -1,50 +0,0 @@ -gsa.plot.venn <- -function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0, - col=c("#FF6342", "#63C6DE", "#ADDE63"), - pos=c(0.20, 0.20, 0.80, 0.82), - debug=0 - ) { - library(png); - library(graphics); - - # Set up properties - for (i in 1:length(col)) { - rgbcol = col2rgb(col[i]); - col[i] = sprintf("%02X%02X%02X", rgbcol[1], rgbcol[2], rgbcol[3]); - } - - chco = paste(col[1], col[2], col[3], sep=","); - chd = paste(a, b, c, a_and_b, a_and_c, b_and_c, sep=","); - - props = c( - 'cht=v', - 'chs=525x525', - 'chds=0,10000000000', - paste('chco=', chco, sep=""), - paste('chd=t:', chd, sep="") - ); - proplist = paste(props[1], props[2], props[3], props[4], props[5], sep='&'); - - # Get the venn diagram (as a temporary file) - filename = tempfile("venn"); - cmd = paste("wget -O ", filename, " 'http://chart.apis.google.com/chart?", proplist, "' > /dev/null 2>&1", sep=""); - - if 
(debug == 1) { - print(cmd); - } - system(cmd); - - # Render the temp png file into a plotting frame - a = readPNG(filename); - - plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab=""); - if (c == 0 || a >= b) { - rasterImage(a, pos[1], pos[2], pos[3], pos[4]); - } else { - rasterImage(a, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180); - } - - # Clean up! - unlink(filename); -} - diff --git a/R/src/gsalib/R/gsa.read.eval.R b/R/src/gsalib/R/gsa.read.eval.R deleted file mode 100644 index f1d49092b..000000000 --- a/R/src/gsalib/R/gsa.read.eval.R +++ /dev/null @@ -1,83 +0,0 @@ -.gsa.attemptToLoadFile <- function(filename) { - file = NA; - - if (file.exists(filename) & file.info(filename)$size > 500) { - file = read.csv(filename, header=TRUE, comment.char="#"); - } - - file; -} - -gsa.read.eval <- -function(evalRoot) { - fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); - fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); - fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep=""); - fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep=""); - fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep=""); - fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep=""); - fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep=""); - fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); - fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); - fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); - fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep=""); - fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); - fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep=""); - fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", 
sep=""); - - eval = list( - AlleleCountStats = NA, - CompOverlap = NA, - CountVariants = NA, - GenotypeConcordance = NA, - MetricsByAc = NA, - MetricsBySample = NA, - Quality_Metrics_by_allele_count = NA, - QualityScoreHistogram = NA, - SampleStatistics = NA, - SampleSummaryStatistics = NA, - SimpleMetricsBySample = NA, - TiTv = NA, - TiTvStats = NA, - Variant_Quality_Score = NA, - - CallsetNames = c(), - CallsetOnlyNames = c(), - CallsetFilteredNames = c() - ); - - eval$AlleleCountStats = .gsa.attemptToLoadFile(fileAlleleCountStats); - eval$CompOverlap = .gsa.attemptToLoadFile(fileCompOverlap); - eval$CountVariants = .gsa.attemptToLoadFile(fileCountVariants); - eval$GenotypeConcordance = .gsa.attemptToLoadFile(fileGenotypeConcordance); - eval$MetricsByAc = .gsa.attemptToLoadFile(fileMetricsByAc); - eval$MetricsBySample = .gsa.attemptToLoadFile(fileMetricsBySample); - eval$Quality_Metrics_by_allele_count = .gsa.attemptToLoadFile(fileQuality_Metrics_by_allele_count); - eval$QualityScoreHistogram = .gsa.attemptToLoadFile(fileQualityScoreHistogram); - eval$SampleStatistics = .gsa.attemptToLoadFile(fileSampleStatistics); - eval$SampleSummaryStatistics = .gsa.attemptToLoadFile(fileSampleSummaryStatistics); - eval$SimpleMetricsBySample = .gsa.attemptToLoadFile(fileSimpleMetricsBySample); - eval$TiTv = .gsa.attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator); - eval$TiTvStats = .gsa.attemptToLoadFile(fileTiTvStats); - eval$Variant_Quality_Score = .gsa.attemptToLoadFile(fileVariant_Quality_Score); - - uniqueJexlExpressions = unique(eval$TiTv$jexl_expression); - eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); - eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames)); - eval$CallsetFilteredNames = as.vector(c( - paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), 
sep=""), - paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep="")) - ); - - if (!(eval$CallsetFilteredNames[1] %in% unique(eval$TiTv$jexl_expression))) { - eval$CallsetFilteredNames[1] = paste("In", eval$CallsetNames[1], "-FilteredIn", eval$CallsetNames[2], sep=""); - } - - if (!(eval$CallsetFilteredNames[2] %in% unique(eval$TiTv$jexl_expression))) { - eval$CallsetFilteredNames[2] = paste("In", eval$CallsetNames[2], "-FilteredIn", eval$CallsetNames[1], sep=""); - #eval$CallsetFilteredNames[2] = paste(gsub("^(\\w)", "In", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In", eval$CallsetNames[1], perl=TRUE), sep=""); - } - - eval; -} - diff --git a/R/src/gsalib/R/gsa.read.gatkreport.R b/R/src/gsalib/R/gsa.read.gatkreport.R deleted file mode 100644 index 9b3ef1ad1..000000000 --- a/R/src/gsalib/R/gsa.read.gatkreport.R +++ /dev/null @@ -1,64 +0,0 @@ -# Load a table into the specified environment. Make sure that each new table gets a unique name (this allows one to cat a bunch of tables with the same name together and load them into R without each table overwriting the last. 
-.gsa.assignGATKTableToEnvironment <- function(tableName, tableHeader, tableRows, tableEnv) { - d = data.frame(tableRows, row.names=NULL, stringsAsFactors=FALSE); - colnames(d) = tableHeader; - - for (i in 1:ncol(d)) { - v = suppressWarnings(as.numeric(d[,i])); - - if (length(na.omit(as.numeric(v))) == length(d[,i])) { - d[,i] = v; - } - } - - usedNames = ls(envir=tableEnv, pattern=tableName); - - if (length(usedNames) > 0) { - tableName = paste(tableName, ".", length(usedNames), sep=""); - } - - assign(tableName, d, envir=tableEnv); -} - -# Load all GATKReport tables from a file -gsa.read.gatkreport <- function(filename) { - con = file(filename, "r", blocking = TRUE); - lines = readLines(con); - close(con); - - tableEnv = new.env(); - - tableName = NA; - tableHeader = c(); - tableRows = c(); - - for (line in lines) { - if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) { - headerFields = unlist(strsplit(line, "[[:space:]]+")); - - if (!is.na(tableName)) { - .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); - } - - tableName = headerFields[2]; - tableHeader = c(); - tableRows = c(); - } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) { - # do nothing - } else if (!is.na(tableName)) { - row = unlist(strsplit(line, "[[:space:]]+")); - - if (length(tableHeader) == 0) { - tableHeader = row; - } else { - tableRows = rbind(tableRows, row); - } - } - } - - if (!is.na(tableName)) { - .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); - } - - gatkreport = as.list(tableEnv); -} diff --git a/R/src/gsalib/R/gsa.read.squidmetrics.R b/R/src/gsalib/R/gsa.read.squidmetrics.R deleted file mode 100644 index 39fa1ad32..000000000 --- a/R/src/gsalib/R/gsa.read.squidmetrics.R +++ /dev/null @@ -1,28 +0,0 @@ -gsa.read.squidmetrics = function(project, bylane = FALSE) { - suppressMessages(library(ROracle)); - - drv = dbDriver("Oracle"); - con = 
dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD"); - - if (bylane) { - statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS WHERE \"Project\" = '", project, "'", sep=""); - print(statement); - - rs = dbSendQuery(con, statement = statement); - d = fetch(rs, n=-1); - dbHasCompleted(rs); - dbClearResult(rs); - } else { - statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG WHERE \"Project\" = '", project, "'", sep=""); - print(statement); - - rs = dbSendQuery(con, statement = statement); - d = fetch(rs, n=-1); - dbHasCompleted(rs); - dbClearResult(rs); - } - - oraCloseDriver(drv); - - subset(d, Project == project); -} diff --git a/R/src/gsalib/R/gsa.warn.R b/R/src/gsalib/R/gsa.warn.R deleted file mode 100644 index 7ee08ce65..000000000 --- a/R/src/gsalib/R/gsa.warn.R +++ /dev/null @@ -1,3 +0,0 @@ -gsa.warn <- function(message) { - gsa.message(sprintf("Warning: %s", message)); -} diff --git a/R/src/gsalib/Read-and-delete-me b/R/src/gsalib/Read-and-delete-me deleted file mode 100644 index d04323a6e..000000000 --- a/R/src/gsalib/Read-and-delete-me +++ /dev/null @@ -1,9 +0,0 @@ -* Edit the help file skeletons in 'man', possibly combining help files - for multiple functions. -* Put any C/C++/Fortran code in 'src'. -* If you have compiled code, add a .First.lib() function in 'R' to load - the shared library. -* Run R CMD build to build the package tarball. -* Run R CMD check to check the package tarball. - -Read "Writing R Extensions" for more information. 
diff --git a/R/src/gsalib/data/tearsheetdrop.jpg b/R/src/gsalib/data/tearsheetdrop.jpg deleted file mode 100755 index c9d480fa0..000000000 Binary files a/R/src/gsalib/data/tearsheetdrop.jpg and /dev/null differ diff --git a/R/src/gsalib/man/gsa.error.Rd b/R/src/gsalib/man/gsa.error.Rd deleted file mode 100644 index df7c0cbde..000000000 --- a/R/src/gsalib/man/gsa.error.Rd +++ /dev/null @@ -1,49 +0,0 @@ -\name{gsa.error} -\alias{gsa.error} -\title{ -GSA error -} -\description{ -Write an error message to standard error with the prefix '[gsalib] Error:', print a traceback, and stop with an error. -} -\usage{ -gsa.error(message) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{message}{ -The error message to write. -} -} -\details{ -%% ~~ If necessary, more details than the description above ~~ -} -\value{ -%% ~Describe the value returned -%% If it is a LIST, use -%% \item{comp1 }{Description of 'comp1'} -%% \item{comp2 }{Description of 'comp2'} -%% ... -} -\references{ -%% ~put references to the literature/web site here ~ -} -\author{ -Kiran Garimella -} -\note{ -%% ~~further notes~~ -} - -%% ~Make other sections like Warning with \section{Warning }{....} ~ - -\seealso{ -%% ~~objects to See Also as \code{\link{help}}, ~~~ -} -\examples{ -gsa.error("This is a message"); -} -% Add one or more standard keywords, see file 'KEYWORDS' in the -% R documentation directory. -\keyword{ ~kwd1 } -\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/R/src/gsalib/man/gsa.getargs.Rd b/R/src/gsalib/man/gsa.getargs.Rd deleted file mode 100644 index 27aa1b05a..000000000 --- a/R/src/gsalib/man/gsa.getargs.Rd +++ /dev/null @@ -1,57 +0,0 @@ -\name{gsa.getargs} -\alias{gsa.getargs} -\title{ -Get script arguments -} -\description{ -Get script arguments given a list object specifying arguments and documentation. Can be used in command-line or interactive mode. 
This is helpful when developing scripts in interactive mode that will eventually become command-line programs. If no arguments are specified or help is requested in command-line mode, the script will print out a usage statement with available arguments and exit. -} -\usage{ -gsa.getargs(argspec, doc = NA) -} -\arguments{ - \item{argspec}{ -A list object. Each key is an argument name. The value is another list object with a 'value' and 'doc' keys. For example: -\preformatted{argspec = list( - arg1 = list(value=10, doc="Info for optional arg1"), - arg2 = list(value=NA, doc="Info for required arg2") -); -} - -If the value provided is NA, the argument is considered required and must be specified when the script is invoked. For command-line mode, this means the argument must be specified on the command-line. In interactive mode, there are two ways of specifying these arguments. First, if a properly formatted list argument called 'cmdargs' is present in the current environment (i.e. the object returned by gsa.getargs() from a previous invocation), the value is taken from this object. Otherwise, the argument is prompted for. -} - - \item{doc}{ -An optional string succinctly documenting the purpose of the script. -} -} -\details{ -Interactive scripts typically make use of hardcoded filepaths and parameter settings. This makes testing easy, but generalization to non-interactive mode more difficult. This utility provides a mechanism for writing scripts that work properly in both interactive and command-line modes. - -To use this method, specify a list with key-value pairs representing the arguments as specified above. In command-line mode, if no arguments are specified or the user specifies '-h' or '-help' anywhere on the command string, a help message indicating available arguments, their default values, and some documentation about the argument are provided. -} -\value{ -Returns a list with keys matching the argspec and values representing the specified arguments. 
- -\item{arg1 }{Value for argument 1} -\item{arg2 }{Value for argument 2} -...etc. -} -\references{ -%% ~put references to the literature/web site here ~ -} -\author{ -Kiran Garimella -} -\examples{ -argspec = list( - file = list(value="/my/test.vcf", doc="VCF file"), - verbose = list(value=0, doc="If 1, set verbose mode"), - test2 = list(value=2.3e9, doc="Another argument that does stuff") -); - -cmdargs = gsa.getargs(argspec, doc="My test program"); - -print(cmdargs$file); # will print '[1] "/my/test.vcf"' -} -\keyword{ ~kwd1 } diff --git a/R/src/gsalib/man/gsa.message.Rd b/R/src/gsalib/man/gsa.message.Rd deleted file mode 100644 index 9752de8a9..000000000 --- a/R/src/gsalib/man/gsa.message.Rd +++ /dev/null @@ -1,44 +0,0 @@ -\name{gsa.message} -\alias{gsa.message} -\title{ -GSA message -} -\description{ -Write a message to standard error with the prefix '[gsalib]'. -} -\usage{ -gsa.message(message) -} -\arguments{ - \item{message}{ -The message to write. -} -} -\details{ -%% ~~ If necessary, more details than the description above ~~ -} -\value{ -%% ~Describe the value returned -%% If it is a LIST, use -%% \item{comp1 }{Description of 'comp1'} -%% \item{comp2 }{Description of 'comp2'} -%% ... 
-} -\references{ -%% ~put references to the literature/web site here ~ -} -\author{ -Kiran Garimella -} -\note{ -%% ~~further notes~~ -} - -\seealso{ -%% ~~objects to See Also as \code{\link{help}}, ~~~ -} -\examples{ -## Write message to stderr -gsa.message("This is a message"); -} -\keyword{ ~kwd1 } diff --git a/R/src/gsalib/man/gsa.plot.venn.Rd b/R/src/gsalib/man/gsa.plot.venn.Rd deleted file mode 100644 index bf4feb5bc..000000000 --- a/R/src/gsalib/man/gsa.plot.venn.Rd +++ /dev/null @@ -1,75 +0,0 @@ -\name{gsa.plot.venn} -\alias{gsa.plot.venn} -\title{ -Plot a proportional venn diagram -} -\description{ -Plot a proportional venn diagram (two or three-way venns allowed) -} -\usage{ -gsa.plot.venn(a, b, c = 0, a_and_b, a_and_c = 0, b_and_c = 0, col = c("#FF6342", "#63C6DE", "#ADDE63"), pos = c(0.2, 0.2, 0.8, 0.82), debug = 0) -} -\arguments{ - \item{a}{ -size of 'a' circle -} - \item{b}{ -size of 'b' circle -} - \item{c}{ -size of 'c' circle -} - \item{a_and_b}{ -size of a and b overlap -} - \item{a_and_c}{ -size of a and c overlap -} - \item{b_and_c}{ -size of b and c overlap -} - \item{col}{ -vector of colors for each venn piece -} - \item{pos}{ -vector of four plot coordinates (xleft, ybottom, xright, ytop), on a plot whose axes run from 0 to 1, giving where the diagram image is drawn -} - \item{debug}{ -if 1, set debug mode and print useful information -} -} -\details{ -Plots a two-way or three-way proportional Venn diagram. Internally, this method uses the Google Chart API to generate the diagram, then renders it into the plot window where it can be annotated in interesting ways. -} -\value{ -%% ~Describe the value returned -%% If it is a LIST, use -%% \item{comp1 }{Description of 'comp1'} -%% \item{comp2 }{Description of 'comp2'} -%% ... 
-} -\references{ -} -\author{ -Kiran Garimella -} -\note{ -%% ~~further notes~~ -} - -%% ~Make other sections like Warning with \section{Warning }{....} ~ - -\seealso{ -%% ~~objects to See Also as \code{\link{help}}, ~~~ -} -\examples{ -## Plot a two-way Venn diagram -gsa.plot.venn(1000, 750, 0, 400); - -## Plot a three-way Venn diagram -gsa.plot.venn(1000, 750, 900, 400, 650, 500); -} -% Add one or more standard keywords, see file 'KEYWORDS' in the -% R documentation directory. -\keyword{ ~kwd1 } -\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/R/src/gsalib/man/gsa.read.eval.Rd b/R/src/gsalib/man/gsa.read.eval.Rd deleted file mode 100644 index 0e2baba73..000000000 --- a/R/src/gsalib/man/gsa.read.eval.Rd +++ /dev/null @@ -1,111 +0,0 @@ -\name{gsa.read.eval} -\alias{gsa.read.eval} -\title{ -Read a VariantEval file -} -\description{ -Read a VariantEval file that's output in R format. -} -\usage{ -gsa.read.eval(evalRoot) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{evalRoot}{ -%% ~~Describe \code{evalRoot} here~~ -} -} -\details{ -%% ~~ If necessary, more details than the description above ~~ -} -\value{ -%% ~Describe the value returned -%% If it is a LIST, use -%% \item{comp1 }{Description of 'comp1'} -%% \item{comp2 }{Description of 'comp2'} -%% ... -} -\references{ -%% ~put references to the literature/web site here ~ -} -\author{ -%% ~~who you are~~ -} -\note{ -%% ~~further notes~~ -} - -%% ~Make other sections like Warning with \section{Warning }{....} ~ - -\seealso{ -%% ~~objects to See Also as \code{\link{help}}, ~~~ -} -\examples{ -##---- Should be DIRECTLY executable !! ---- -##-- ==> Define data, use random, -##-- or do help(data=index) for the standard data sets. 
- -## The function is currently defined as -function(evalRoot) { - fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); - fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); - fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep=""); - fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep=""); - fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep=""); - fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep=""); - fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep=""); - fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); - fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); - fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); - fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep=""); - fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); - fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep=""); - fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); - - eval = list( - AlleleCountStats = NA, - CompOverlap = NA, - CountVariants = NA, - GenotypeConcordance = NA, - MetricsByAc = NA, - MetricsBySample = NA, - Quality_Metrics_by_allele_count = NA, - QualityScoreHistogram = NA, - SampleStatistics = NA, - SampleSummaryStatistics = NA, - SimpleMetricsBySample = NA, - TiTv = NA, - TiTvStats = NA, - Variant_Quality_Score = NA, - - CallsetNames = c(), - CallsetOnlyNames = c(), - CallsetFilteredNames = c() - ); - - eval$AlleleCountStats = .attemptToLoadFile(fileAlleleCountStats); - eval$CompOverlap = .attemptToLoadFile(fileCompOverlap); - eval$CountVariants = .attemptToLoadFile(fileCountVariants); - eval$GenotypeConcordance = .attemptToLoadFile(fileGenotypeConcordance); - eval$MetricsByAc = .attemptToLoadFile(fileMetricsByAc); - eval$MetricsBySample = 
.attemptToLoadFile(fileMetricsBySample); - eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count); - eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram); - eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics); - eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics); - eval$SimpleMetricsBySample = .attemptToLoadFile(fileSimpleMetricsBySample); - eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator); - eval$TiTvStats = .attemptToLoadFile(fileTiTvStats); - eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score); - - uniqueJexlExpressions = unique(eval$TiTv$jexl_expression); - eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); - eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames)); - eval$CallsetFilteredNames = as.vector(c()); - eval; - } -} -% Add one or more standard keywords, see file 'KEYWORDS' in the -% R documentation directory. -\keyword{ ~kwd1 } -\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/R/src/gsalib/man/gsa.read.gatkreport.Rd b/R/src/gsalib/man/gsa.read.gatkreport.Rd deleted file mode 100644 index 67c2c7b28..000000000 --- a/R/src/gsalib/man/gsa.read.gatkreport.Rd +++ /dev/null @@ -1,55 +0,0 @@ -\name{gsa.read.gatkreport} -\alias{gsa.read.gatkreport} -\title{ -gsa.read.gatkreport -} -\description{ -Reads a GATKReport file - a multi-table document - and loads each table as a separate data.frame object in a list. -} -\usage{ -gsa.read.gatkreport(filename) -} -\arguments{ - \item{filename}{ -The path to the GATKReport file. -} -} -\details{ -The GATKReport format replaces the multi-file output format used by many GATK tools and provides a single, consolidated file format. This format accommodates multiple tables and is still R-loadable - through this function. 
- -The file format looks like this: -\preformatted{##:GATKReport.v0.1 TableName : The description of the table -col1 col2 col3 -0 0.007451835696110506 25.474613284804366 -1 0.002362777171937477 29.844949954504095 -2 9.087604507451836E-4 32.87590975254731 -3 5.452562704471102E-4 34.498999090081895 -4 9.087604507451836E-4 35.14831665150137 -} - -} -\value{ -Returns a list object, where each key is the TableName and the value is the data.frame object with the contents of the table. If multiple tables with the same name exist, each one after the first is stored under a numbered name: "TableName.1", "TableName.2", ..., "TableName.N". -%% ~Describe the value returned -%% If it is a LIST, use -%% \item{comp1 }{Description of 'comp1'} -%% \item{comp2 }{Description of 'comp2'} -%% ... -} -\references{ -%% ~put references to the literature/web site here ~ -} -\author{ -Kiran Garimella -} -\note{ -%% ~~further notes~~ -} - -\seealso{ -%% ~~objects to See Also as \code{\link{help}}, ~~~ -} -\examples{ -report = gsa.read.gatkreport("/path/to/my/output.gatkreport"); -} -\keyword{ ~kwd1 } diff --git a/R/src/gsalib/man/gsa.read.squidmetrics.Rd b/R/src/gsalib/man/gsa.read.squidmetrics.Rd deleted file mode 100644 index 0a8b37843..000000000 --- a/R/src/gsalib/man/gsa.read.squidmetrics.Rd +++ /dev/null @@ -1,48 +0,0 @@ -\name{gsa.read.squidmetrics} -\alias{gsa.read.squidmetrics} -\title{ -gsa.read.squidmetrics -} -\description{ -Reads metrics for a specified SQUID project into a dataframe. -} -\usage{ -gsa.read.squidmetrics(project, bylane = FALSE) -} -\arguments{ - \item{project}{ -The project for which metrics should be obtained. -} - \item{bylane}{ -If TRUE, obtains per-lane metrics rather than the default per-sample metrics. -} -} -\details{ -%% ~~ If necessary, more details than the description above ~~ -} -\value{ -%% ~Describe the value returned -%% If it is a LIST, use -%% \item{comp1 }{Description of 'comp1'} -%% \item{comp2 }{Description of 'comp2'} -%% ... 
-Returns a data frame with one row per sample (or per lane when bylane is TRUE) and one column per metric. -} -\references{ -%% ~put references to the literature/web site here ~ -} -\author{ -Kiran Garimella -} -\note{ -This method will only work within the Broad Institute internal network. -} - -\seealso{ -%% ~~objects to See Also as \code{\link{help}}, ~~~ -} -\examples{ -## Obtain metrics for project C315. -d = gsa.read.squidmetrics("C315"); -} -\keyword{ ~kwd1 } diff --git a/R/src/gsalib/man/gsa.warn.Rd b/R/src/gsalib/man/gsa.warn.Rd deleted file mode 100644 index 0b9770b5c..000000000 --- a/R/src/gsalib/man/gsa.warn.Rd +++ /dev/null @@ -1,46 +0,0 @@ -\name{gsa.warn} -\alias{gsa.warn} -\title{ -GSA warn -} -\description{ -Write a warning message to standard error with the prefix '[gsalib] Warning:'. -} -\usage{ -gsa.warn(message) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{message}{ -The warning message to write. -} -} -\details{ -%% ~~ If necessary, more details than the description above ~~ -} -\value{ -%% ~Describe the value returned -%% If it is a LIST, use -%% \item{comp1 }{Description of 'comp1'} -%% \item{comp2 }{Description of 'comp2'} -%% ... 
-} -\references{ -%% ~put references to the literature/web site here ~ -} -\author{ -Kiran Garimella -} -\note{ -%% ~~further notes~~ -} - -\seealso{ -%% ~~objects to See Also as \code{\link{help}}, ~~~ -} -\examples{ -## Write message to stdout -gsa.warn("This is a warning message"); -} -\keyword{ ~kwd1 } -\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/R/src/gsalib/man/gsalib-package.Rd b/R/src/gsalib/man/gsalib-package.Rd deleted file mode 100644 index 2b8d6db9f..000000000 --- a/R/src/gsalib/man/gsalib-package.Rd +++ /dev/null @@ -1,68 +0,0 @@ -\name{gsalib-package} -\alias{gsalib-package} -\alias{gsalib} -\docType{package} -\title{ -GATK utility analysis functions -} -\description{ -Utility functions for analyzing GATK-processed NGS data -} -\details{ -This package contains functions for working with GATK-processed NGS data. These functions include a command-line parser that also allows a script to be used in interactive mode (good for developing scripts that will eventually be automated), a proportional Venn diagram generator, convenience methods for parsing VariantEval output, and more. 
-} -\author{ -Genome Sequencing and Analysis Group - -Medical and Population Genetics Program - -Maintainer: Kiran Garimella -} -\references{ -GSA wiki page: http://www.broadinstitute.org/gsa/wiki - -GATK help forum: http://www.getsatisfaction.com/gsa -} -\examples{ -## get script arguments in interactive and non-interactive mode -cmdargs = gsa.getargs( list( - requiredArg1 = list( - value = NA, - doc = "Documentation for requiredArg1" - ), - - optionalArg1 = list( - value = 3e9, - doc = "Documentation for optionalArg1" - ) -) ); - -## plot a proportional Venn diagram -gsa.plot.venn(500, 250, 0, 100); - -## read a GATKReport file -report = gsa.read.gatkreport("/path/to/my/output.gatkreport"); - -## emit a message -gsa.message("This is a message"); - -## emit a warning message -gsa.warn("This is a warning message"); - -## emit an error message -gsa.error("This is an error message"); - -## read the SQUID metrics for a given sequencing project (internal to the Broad only) -s = gsa.read.squidmetrics("C427"); - -## read command-line arguments -cmdargs = gsa.getargs( - list( - file = list(value="/my/test.vcf", doc="VCF file"), - verbose = list(value=0, doc="If 1, set verbose mode"), - test2 = list(value=2.3e9, doc="Another argument that does stuff") - ), - doc="My test program" -); -} -\keyword{ package } diff --git a/R/tearsheet.r b/R/tearsheet.r deleted file mode 100755 index 47bbe093c..000000000 --- a/R/tearsheet.r +++ /dev/null @@ -1,245 +0,0 @@ -#Before executing this file, save the squid files as csv, then as tab-delimited files with only the column names as the header, and change the format of all cells to numbers. Assign the paths to these files to "samples" and "lanes" respectively. 
#testcomment
# Tear-sheet driver script. Positional command-line arguments:
#   1 lanes        per-lane metrics file (read with read.delim, header row expected)
#   2 samples      per-sample metrics file (read with read.delim, header row expected)
#   3 sample_sets  prefix used for the name of every PDF written below
#   4 eval         error-rate-per-cycle table
#   5 noveltitv    Ti/Tv table for novel SNP calls
#   6 knowntitv    Ti/Tv table for known SNP calls
#   7 DOC          depth-of-coverage table
# An argument that is not supplied is NA, and the section that needs it is skipped
# with a message (except sample_sets, which is prompted for).
args <- commandArgs(TRUE)
lanes <- args[1]
samples <- args[2]
sample_sets <- args[3]
eval <- args[4]
noveltitv <- args[5]
knowntitv <- args[6]
DOC <- args[7]

# Print the given pieces pasted together with no separator and without quotes.
say <- function(...) {
  print(paste(..., sep = ""), quote = FALSE)
}

# Right-hand panel shared by the three per-lane SNP PDFs: a boxplot of the
# per-lane SNP counts, annotated with the number of boxplot outliers.
draw_snp_boxplot <- function(snps) {
  boxplot(snps, main = "SNPs Called in Lane", ylab = "SNPs Called")
  n_outliers <- length(boxplot.stats(snps)$out)
  if (n_outliers == 0) {
    mtext("No outliers", side = 1, line = 4)
  } else {
    mtext(paste("Outlier SNP call counts in ", n_outliers, "lanes"), side = 1, line = 4)
  }
}

if (is.na(sample_sets)) {
  print("Please specify sample set for file naming and press enter.")
  sample_sets <- scan("stdin", what = "character", n = 1)
  print("Thanks!")
}

if (!is.na(lanes) && !is.na(samples)) {
  # this makes a table & graphs using Picard data
  bylane <- read.delim(file = lanes, header = TRUE)
  bysample <- read.delim(file = samples, header = TRUE)

  # Calc by lane metrics.
  # attach() exposes the columns of bylane as bare names until detach(bylane) below.
  attach(bylane)
  callable.target <- HS_TARGET_TERRITORY[1]
  singlelanes <- length(which(Lane.Type == "Single"))
  pairedlanes <- length(which(Lane.Type == "Paired"))
  mean.read.lane <- signif(mean(AL_TOTAL_READS, na.rm = TRUE))
  sd.read.lane <- signif(sd(AL_TOTAL_READS, na.rm = TRUE))
  mean.ub.lane <- signif(mean(HS_ON_TARGET_BASES, na.rm = TRUE))
  sd.ub.lane <- signif(sd(HS_ON_TARGET_BASES, na.rm = TRUE))
  mean.cov.lane <- round(mean(HS_MEAN_TARGET_COVERAGE, na.rm = TRUE))
  sd.cov.lane <- round(sd(HS_MEAN_TARGET_COVERAGE, na.rm = TRUE))
  mean.10x.lane <- round(mean(HS_PCT_TARGET_BASES_10X, na.rm = TRUE))
  mean.20x.lane <- round(mean(HS_PCT_TARGET_BASES_20X, na.rm = TRUE))
  mean.30x.lane <- round(mean(HS_PCT_TARGET_BASES_30X, na.rm = TRUE))
  sd.10x.lane <- round(sd(HS_PCT_TARGET_BASES_10X, na.rm = TRUE))
  sd.20x.lane <- round(sd(HS_PCT_TARGET_BASES_20X, na.rm = TRUE))
  sd.30x.lane <- round(sd(HS_PCT_TARGET_BASES_30X, na.rm = TRUE))

  # Axis labels of the form "<Project> <External.ID>-<Lane>", one per lane.
  lane_labels <- paste(Project, " ", External.ID, "-", Lane, sep = "")
  n_lanes <- length(SNP_TOTAL_SNPS)
  lane_idx <- seq_along(SNP_TOTAL_SNPS)

  # makes a plot of the number of SNPS called per lane
  library(graphics)

  # The page size of each PDF scales with the number of lanes.
  pdf(file = paste(sample_sets, "_SNPS.pdf", sep = ""), width = 0.2 * n_lanes, height = 0.1 * n_lanes)

  # Two-thirds of the page for the scatter plot, one third for the boxplot.
  layout(matrix(c(1, 1, 2), 1, 3, byrow = FALSE), respect = TRUE)
  plot(lane_idx, SNP_TOTAL_SNPS, main = "SNPs Called in Each Lane", xlab = "", ylab = "SNPs Called in Lane", xaxt = "n", pch = 16, col = "blue")
  axis(side = 1, at = lane_idx, labels = lane_labels, cex.axis = 0.75, las = 2)

  draw_snp_boxplot(SNP_TOTAL_SNPS)

  dev.off()

  # makes SNP plot in log scale

  pdf(file = paste(sample_sets, "_SNPS_log.pdf", sep = ""), width = 0.2 * n_lanes, height = 0.1 * n_lanes)

  layout(matrix(c(1, 1, 2), 1, 3, byrow = FALSE), respect = TRUE)
  plot(lane_idx, log(SNP_TOTAL_SNPS), main = "SNPs Called in Each Lane", xlab = "", ylab = "Log(SNPs Called in Lane)", xaxt = "n", pch = 16, col = "blue")
  par(ylog = TRUE)
  axis(side = 1, at = lane_idx, labels = lane_labels, cex.axis = 0.75, las = 2)

  draw_snp_boxplot(SNP_TOTAL_SNPS)

  dev.off()

  # makes a plot of snp calls ordered by lane
  pdf(file = paste(sample_sets, "_SNPS_lane.pdf", sep = ""), width = 0.2 * n_lanes, height = 0.1 * n_lanes)

  layout(matrix(c(1, 1, 2), 1, 3, byrow = FALSE), respect = TRUE)
  # NOTE(review): the y label says "Log(...)" but the values plotted here are the raw
  # counts; label and par(ylog) are kept as they were -- confirm which was intended.
  plot(lane_idx, SNP_TOTAL_SNPS[order(Lane)], main = "SNPs Called in Each Lane", xlab = "", ylab = "Log(SNPs Called in Lane)", xaxt = "n", pch = 16, col = "blue")
  par(ylog = TRUE)
  axis(side = 1, at = lane_idx, labels = lane_labels[order(Lane)], cex.axis = 0.75, las = 2)

  draw_snp_boxplot(SNP_TOTAL_SNPS)

  dev.off()

  # makes a plot of fingerprint calls and labels them good or bad.
  # A lane is flagged when it has fewer than 15 confidently matching fingerprint SNPs.
  # The original computed union() of this test with itself, which yields the same indices.
  # NOTE(review): the second operand may have been meant to test FP_CONFIDENT_CALLS -- confirm.
  badsnps <- which(FP_CONFIDENT_MATCHING_SNPS < 15)

  fp_colors <- rep("Blue", length(FP_CONFIDENT_CALLS))
  fp_colors[badsnps] <- "Red"

  pdf(file = paste(sample_sets, "_Fingerprints.pdf", sep = ""), width = .2 * length(FP_CONFIDENT_CALLS), height = .1 * length(FP_CONFIDENT_CALLS))
  par(mar = c(6, 4, 5, 4))
  # Filled points: confident matching calls; open points: all confident calls.
  plot(seq_along(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch = 16, ylim = c(0, 24), ylab = "Fingerprint calls", xlab = "", xaxt = "n", col = fp_colors, main = "Fingerprint Calling and Matching")
  points(seq_along(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_CALLS, col = fp_colors)
  axis(side = 1, at = seq_along(FP_CONFIDENT_CALLS), labels = lane_labels, cex.axis = 0.75, las = 2)

  if (length(badsnps) > 0) {
    legend("bottomright", legend = c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes"), pch = c(1, 16, 1, 16), col = c("Blue", "Blue", "Red", "Red"))
    mtext("Some problematic fingerprint sites", side = 3)
  } else {
    legend("bottomright", legend = c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane"), pch = c(1, 16), col = "Blue")
  }

  dev.off()

  detach(bylane)

  # Calc by sample metrics
  attach(bysample)
  mean.lanes.samp <- signif(mean(X..Lanes.included.in.aggregation, na.rm = TRUE))
  sd.lanes.samp <- signif(sd(X..Lanes.included.in.aggregation, na.rm = TRUE))
  mean.mrl.samp <- signif(mean(Mean.Read.Length, na.rm = TRUE))
  sd.mrl.samp <- signif(sd(Mean.Read.Length, na.rm = TRUE))
  mean.read.samp <- signif(mean(Total.Reads, na.rm = TRUE))
  sd.read.samp <- signif(sd(Total.Reads, na.rm = TRUE))
  mean.ub.samp <- signif(mean(On.Target.Bases..HS., na.rm = TRUE))
  sd.ub.samp <- signif(sd(On.Target.Bases..HS., na.rm = TRUE))
  mean.cov.samp <- round(mean(Mean.Target.Coverage..HS., na.rm = TRUE))
  sd.cov.samp <- round(sd(Mean.Target.Coverage..HS., na.rm = TRUE))
  mean.10x.samp <- round(mean(PCT.Target.Bases.10x..HS., na.rm = TRUE))
  mean.20x.samp <- round(mean(PCT.Target.Bases.20x..HS., na.rm = TRUE))
  mean.30x.samp <- round(mean(PCT.Target.Bases.30x..HS., na.rm = TRUE))
  sd.10x.samp <- round(sd(PCT.Target.Bases.10x..HS., na.rm = TRUE))
  sd.20x.samp <- round(sd(PCT.Target.Bases.20x..HS., na.rm = TRUE))
  sd.30x.samp <- round(sd(PCT.Target.Bases.30x..HS., na.rm = TRUE))

  detach(bysample)

  # print all of this stuff out in R.
  say("Callable Target: ", callable.target, " bases")
  say("Used Lanes per Sample: ", mean.lanes.samp, " +/- ", sd.lanes.samp)
  say("Parities: ", singlelanes, " single lanes, ", pairedlanes, " paired lanes")
  say("Read Lengths: ", mean.mrl.samp, " +/- ", sd.mrl.samp)
  say("Reads per lane: ", mean.read.lane, " +/- ", sd.read.lane)
  say("Reads per sample: ", mean.read.samp, " +/- ", sd.read.samp)
  say("Used bases per lane: ", mean.ub.lane, " +/- ", sd.ub.lane)
  say("Used bases per sample: ", mean.ub.samp, " +/- ", sd.ub.samp)
  say("Average target coverage per lane: ", mean.cov.lane, " +/- ", sd.cov.lane)
  say("Average target coverage per sample: ", mean.cov.samp, " +/- ", sd.cov.samp)
  say("% loci covered to 10x per lane: ", mean.10x.lane, "% +/- ", sd.10x.lane, "%")
  # The "%" after the mean was missing on this line only; added to match the others.
  say("% loci covered to 10x per sample: ", mean.10x.samp, "% +/- ", sd.10x.samp, "%")
  say("% loci covered to 20x per lane: ", mean.20x.lane, "% +/- ", sd.20x.lane, "%")
  say("% loci covered to 20x per sample: ", mean.20x.samp, "% +/- ", sd.20x.samp, "%")
  say("% loci covered to 30x per lane: ", mean.30x.lane, "% +/- ", sd.30x.lane, "%")
  say("% loci covered to 30x per sample: ", mean.30x.samp, "% +/- ", sd.30x.samp, "%")

} else {
  print("Lane and Sample metrics file paths not provided")
}

# Makes Error Rate percycle graph
if (!is.na(eval)) {
  # Read the table once (the original read it twice); the first column is dropped
  # and every remaining column is drawn as one line.
  errtable <- read.delim(eval, header = TRUE)
  errpercycle <- errtable[2:ncol(errtable)]

  pdf(paste(sample_sets, "_errorrate_per_cycle.pdf", sep = ""), width = 6, height = 5)

  crazies <- which(errpercycle[75, ] > 0.3) # this can be changed to any kind of filter for particular lanes

  # Flagged columns get saturated colours and a thicker line; the rest are muted.
  err_colors <- rainbow(ncol(errpercycle), s = 0.5, v = 0.5)
  err_colors[crazies] <- rainbow(length(crazies))
  weights <- rep(1, ncol(errpercycle))
  weights[crazies] <- 2

  matplot(errpercycle, type = "l", lty = "solid", col = err_colors, lwd = weights, main = "Error Rate per Cycle", ylab = "Error Rate", xlab = "Cycle", ylim = c(0, 0.7))

  if (length(crazies) > 0) {
    legend("topleft", title = "Unusual Lanes", legend = colnames(errpercycle)[crazies], lty = "solid", lwd = 2, col = err_colors[crazies], xjust = 0.5)
  } else {
    legend("topleft", legend = "No unusual lanes.", bty = "n")
  }

  dev.off()

} else {
  print("Error Rate Per Cycle file paths not provided")
}

# Makes TI/TV known v novel graph
if (!is.na(noveltitv) && !is.na(knowntitv)) {
  # Uses sample_sets (third command-line argument); the original referenced
  # `sample_set`, a name that is never defined.
  pdf(paste(sample_sets, "_TiTv.pdf", sep = ""), width = 6, height = 5)

  novels <- read.table(file = noveltitv, header = FALSE)
  knowns <- read.table(file = knowntitv, header = FALSE)

  # Column 2 of each table is plotted; column 1 of the novel table labels the x axis.
  plot(novels[, 2], col = "red", ylim = c(0, 3.5), main = "Ti/Tv for Novel and Known SNP calls", ylab = "Ti/Tv", xlab = "", xaxt = "n")
  points(knowns[, 2], col = "blue")

  axis(side = 1, at = seq_along(novels[, 2]), labels = novels[, 1], cex.axis = 1, las = 2)

  legend("bottomright", legend = c("Known Variants", "Novel Variants"), col = c("blue", "red"), pch = 1, xjust = 0.5)
  mtext("Lower Ti/Tv ratios indicated more false positive SNP calls.", side = 1)
dev.off() - }else{ - print("Transition/transversion ratio file paths not provided") - - } - - #Make DOC graph - if(is.na(DOC)==FALSE){ - pdf(paste(sample_set, "_DOC.pdf", sep=""), width=6, height=5) - - as.matrix(as.vector(read.delim(DOC, header=TRUE)[,2:502]))->DOCdata - DOCdata<-matrix(DOCdata*100/sum(DOCdata[1,]), nrow=501, ncol=29, byrow=TRUE) - colnames(DOCdata)<-read.delim(DOC, header=TRUE)[,1] - oddies<-which(apply(DOCdata, 2, max)>10) #can be assigned any particular heuristic - ncolors<-rainbow(ncol(DOCdata), s=0.5, v=0.5) - ncolors[oddies]<-rainbow(length(oddies)) - nweights<-rep(1, ncol(DOCdata)) - nweights[oddies]<-2 - matplot(DOCdata, type="l", main="Depth of Coverage by Sample", ylab="Percent bases covered to a given depth", xlab="log(Depth)", log="x", col=ncolors, lty="solid", lwd=nweights) - - if(length(oddies)>0){ - legend("topright", title="Unusual Cases", legend=colnames(DOCdata)[oddies], lty="solid", lwd=2, col=ncolors[oddies], xjust=0.5) - }else{ - legend("topright", legend="No unusual cases.", bty="n") - } - - dev.off() - - }else{ - print("Depth of Coverage filepath not provided") - } - - diff --git a/R/titvFPEst.R b/R/titvFPEst.R deleted file mode 100755 index 7af5e8bbb..000000000 --- a/R/titvFPEst.R +++ /dev/null @@ -1,138 +0,0 @@ -titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) } - -titvFPEstV <- function(titvExpected, titvs) { - sapply(titvs, function(x) titvFPEst(titvExpected, x)) -} - -calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) { - TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel - 2 * TP / 3 / callable -} - -marginalTiTv <- function( nx, titvx, ny, titvy ) { - tvx = nx / (titvx + 1) - tix = nx - tvx - tvy = ny / (titvy + 1) - tiy = ny - tvy - tiz = tix - tiy - tvz = tvx - tvy - return(tiz / tvz) -} -marginaldbSNPRate <- function( nx, dbx, ny, dby ) { - knownx = nx * dbx / 100 - novelx = nx - knownx - knowny = ny * dby / 100 - 
novely = ny - knowny - knownz = knownx - knowny - novelz = novelx - novely - return(knownz / ( knownz + novelz ) * 100) -} - -numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) { - nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals)) - return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls)) -} - -normalize <- function(x) { - x / sum(x) -} - -normcumsum <- function(x) { - cumsum(normalize(x)) -} - -cumhist <- function(d, ...) { - plot(d[order(d)], type="b", col="orange", lwd=2, ...) -} - -revcumsum <- function(x) { - return(rev(cumsum(rev(x)))) -} - -phred <- function(x) { - log10(max(x,10^(-9.9)))*-10 -} - -pOfB <- function(b, B, Q) { - #print(paste(b, B, Q)) - p = 1 - 10^(-Q/10) - if ( b == B ) - return(p) - else - return(1 - p) -} - -pOfG <- function(bs, qs, G) { - a1 = G[1] - a2 = G[2] - - log10p = 0 - for ( i in 1:length(bs) ) { - b = bs[i] - q = qs[i] - p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2 - log10p = log10p + log10(p1) - } - - return(log10p) -} - -pOfGs <- function(nAs, nBs, Q) { - bs = c(rep("a", nAs), rep("t", nBs)) - qs = rep(Q, nAs + nBs) - G1 = c("a", "a") - G2 = c("a", "t") - G3 = c("t", "t") - - log10p1 = pOfG(bs, qs, G1) - log10p2 = pOfG(bs, qs, G2) - log10p3 = pOfG(bs, qs, G3) - Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3)))) - - return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)) -} - -QsampleExpected <- function(depth, Q) { - weightedAvg = 0 - for ( d in 1:(depth*3) ) { - Qsample = 0 - pOfD = dpois(d, depth) - for ( nBs in 0:d ) { - pOfnB = dbinom(nBs, d, 0.5) - nAs = d - nBs - Qsample = pOfGs(nAs, nBs, Q)$Qsample - #Qsample = 1 - weightedAvg = weightedAvg + Qsample * pOfD * pOfnB - print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg))) - } - } - - return(weightedAvg) -} - -plotQsamples <- function(depths, Qs, Qmax) { - cols = rainbow(length(Qs)) 
- plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling") - - for ( i in 1:length(Qs) ) { - Q = Qs[i] - y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q))) - points(depths, y, col=cols[i], type="b") - } - - legend("topleft", paste("Q", Qs), fill=cols) -} - -pCallHetGivenDepth <- function(depth, nallelesToCall) { - depths = 0:(2*depth) - pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5))) - dpois(depths,depth)*(1-pNoAllelesToCall) -} - -pCallHets <- function(depth, nallelesToCall) { - sum(pCallHetGivenDepth(depth,nallelesToCall)) -} - -pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) { - 1-(1-pCallHets(depth,nallelesToCall))^nsamples -} diff --git a/R/whole_exome_bait_selection.R b/R/whole_exome_bait_selection.R deleted file mode 100755 index fb34a2d1d..000000000 --- a/R/whole_exome_bait_selection.R +++ /dev/null @@ -1,120 +0,0 @@ -count_zeros = function(list) { - zeros = 0 - for (x in list) { - if (x == 0.0) { - zeros = zeros + 1 - } - } - zeros -} - - -load = function(max_rows) { - files = list.files(path=".", pattern="304NA.*") - #max_rows = -1 - - #FREESTANDING as a filter - #HIT_TWICE for ZEROS... 
- - print ("Parsing file 1") - t = read.table(files[1],header=T, nrows = max_rows) - f = data.frame(loc=t$location,gc=t$gc,freestanding=t$freestanding) - ht = data.frame(1:nrow(f)) - - for (file in files) { - print (file) - t = read.table(file, header=T, nrows = max_rows) - norm_cov = t$normalized_coverage - #names(norm_cov) = c("norm_cov.1") - f=cbind (f, norm_cov) - ht=cbind (ht, t$hit_twice) - } - - - wgs = read.table("/seq/dirseq/analysis/agilent/rt-pcr/perfdata//OV-0751-WGS.baits.coverage.txt", header=T, nrows = max_rows) - f=cbind (f, wgs_norm_cov = wgs$normalized_coverage) - - f=cbind(f,ht) - - # Compute normalized variance - print("Calculating variance") - var = apply(f[4:10], 1, var) - print("Calculating std. dev.") - sd = apply(f[4:10], 1, sd) - print("Calculating mean") - mean = apply(f[4:10], 1, mean) - print("Binding normalized variance") - f=cbind (f, normvar=var/mean/mean) - print("Binding normalized std. dev.") - f=cbind (f, normsd=sd/mean) - print("Binding mean") - f=cbind (f, mean=mean) - print("Binding std. 
dev.") - f=cbind (f, sd=sd) - print("Binding variance") - f=cbind (f, var=var) - - print("Calculating and binding number of zeros") - count_zeros = apply(f[4:10], 1, count_zeros) - num_not_hit_twice = apply(f[12:18], 1, count_zeros) - f=cbind(f, count_zeros, num_not_hit_twice) - - print ("Parsing sequences file") - seqs = read.table("whole_exome_agilent_designed_120.design.1line.sorted2",header=T,nrows=max_rows) - f=cbind (f, seqs) - - #of = f[order(f$normvar),] -} - -write_splits = function(f) { - set.seed(0987123409) - - # Low variance - nz = f[f$count_zeros < 1 & f$freestanding==1,] # Take reads with no zeros - d = write_split(nz, "Low_GC_Norm_Coverage", 0.0, 0.35, 0.8, 1.2, 0.0, 0.3, 0.0) - d = rbind(d,write_split(nz, "Mid_GC_Norm_Coverage", 0.45, 0.55, 0.8, 1.2, 0.0, 0.1, 0.0)) - d = rbind(d,write_split(nz, "High_GC_Norm_Coverage", 0.63, 1.0, 0.8, 1.2, 0.0, 0.3, 0.0)) - d = rbind(d,write_split(nz, "Low_GC_Undercovered", 0.0, 0.35, 0.2, 0.3, 0.0, 0.3, 0.0)) - d = rbind(d,write_split(nz, "Mid_GC_Undercovered", 0.45, 0.55, 0.2, 0.3, 0.0, 0.3, 0.0)) - d = rbind(d,write_split(nz, "High_GC_Undercovored", 0.63, 1.0, 0.2, 0.3, 0.0, 0.3, 0.0)) - az = f[f$count_zeros == 7 & f$freestanding==1,] # Take reads with all zeros - d = rbind(d,write_split(az, "Low_GC_No_Coverage", 0.0, 0.35, 0.0, 0.1, -1.0, -1.0, 0.1)) - d = rbind(d,write_split(az, "Mid_GC_No_Coverage", 0.45, 0.55, 0.0, 0.1, -1.0, -1.0, 0.1)) - d = rbind(d,write_split(az, "High_GC_No_Coverage", 0.63, 1.0, 0.0, 0.1, -1.0, -1.0, 0.01)) - - # High variance - d = rbind(d,write_split(nz, "Mid_GC_Norm_Coverage_High_Variation", 0.45, 0.55, 0.8, 1.2, 0.355, 1000.0)) - d -} - -write_split = function(data, label, gc_low, gc_high, cov_low, cov_high, normsd_low, normsd_high, wgs_cov_low) { - if (normsd_high < 0.0) { - # We have no coverage samples - s = data[data$gc >= gc_low & data$gc <= gc_high & data$mean >= cov_low & data$mean <= cov_high & data$wgs_norm_cov >= wgs_cov_low,] - #s = s[order(runif(nrow(s))),] # Randomize 
rows - s = s[order(s$wgs_norm_cov, decreasing = T),] # order according to norm SD - }else{ - # We have low or normal coverage samples, so take those with tightest norm SDs - s = data[data$gc >= gc_low & data$gc <= gc_high & data$mean >= cov_low & data$mean <= cov_high & data$normsd >= normsd_low & data$normsd <= normsd_high ,] - s = s[order(s$normsd),] # order according to norm SD - } - # & data$mean < 1.1 & data$mean > 0.9,] - # & data$mean >= cov_low & data$mean <= cov_high - #print(s) - print(nrow(s)) - s = s[1:50, ] #-c(3,11,12:18,19,23:25)] - s = cbind(class=rep(label,50), s) - s -} - -#f=load() -#nz=f[f$count_zeros < 1,] -#print(summary(nz)) - -create_500 = function(f) { - - f = load(-1) - s = write_splits(f) - write.csv(s, "500_exome_baits_for_nanostring.csv") - -} diff --git a/analysis/depristo/distributedGATK/commands.R b/analysis/depristo/distributedGATK/commands.R deleted file mode 100644 index c3d97277e..000000000 --- a/analysis/depristo/distributedGATK/commands.R +++ /dev/null @@ -1,27 +0,0 @@ -plot1 <- function(d, name) { - d = subset(d, dataset == name) - subd = data.frame(parallel.type=d$parallel.type, nWaysParallel=d$nWaysParallel, end.to.end.time=d$end.to.end.time,per.1M.sites = d$per.1M.sites, job.run.time = d$job.run.time) - - nways = unique(subd$nWaysParallel) - m = max(subset(subd, nWaysParallel == min(nways))$end.to.end.time) - nNW = subset(subd, end.to.end.time == m)$nWaysParallel[1] - timeAt1 = m * nNW - my.runtime = subset(subd, end.to.end.time == m)$job.run.time[1] * nNW - my.pms = subset(subd, end.to.end.time == m)$per.1M.sites[1] - - theo = data.frame(parallel.type="theoretic", end.to.end.time=timeAt1/nways, nWaysParallel=nways, per.1M.sites = my.pms, job.run.time = my.runtime / nways) - - subd = rbind(subd, theo) - - print(summary(subd)) - - print(xyplot(log10(end.to.end.time) + per.1M.sites + log10(job.run.time) ~ log2(nWaysParallel), data=subd[order(subd$nWaysParallel),], group=parallel.type, type="b", outer=T, 
scale=list(relation="free"), auto.key=T, lwd=c(2,2,1), main=name)) - - return(subd) -} - -myData <- read.table("results.new.dat", header=T) -require("lattice") - -for (name in unique(d$dataset)) - plot1(myData, name) diff --git a/analysis/depristo/distributedGATK/distributedGATKMetrics.py b/analysis/depristo/distributedGATK/distributedGATKMetrics.py deleted file mode 100755 index ed8beedec..000000000 --- a/analysis/depristo/distributedGATK/distributedGATKMetrics.py +++ /dev/null @@ -1,121 +0,0 @@ -import sys -from optparse import OptionParser -from itertools import * -import random -import re -import datetime - -# a simple script that does: -# 1 -- generates a master set of variants following the neutral expectation from a single big population -# 2 -- randomly generates M individuals with variants and genotypes sampled as expected from the big population of variants -# 3 -- writes out the genotypes of these individuals, and their allele frequency -def main(): - global OPTIONS - usage = "usage: %prog [options] outputFile" - parser = OptionParser(usage=usage) - - (OPTIONS, args) = parser.parse_args() - if len(args) == 0: - parser.error("Requires at least one argument") - - print 'file dataset parallel.type nWaysParallel start.time end.time end.to.end.time per.1M.sites job.run.time' - typere = '.*/(.*).ptype_(\w+).nways_(\d+).*' - for file in args: - startTime, endTime, perMSites, runtime = None, None, None, None - for line in open(file): - match = re.match(typere, line) - if match != None: dataset, parallelType, nWays = match.groups() - startTime = captureStartTime(line, startTime) - perMSites = capturePerMSites(line, perMSites) - endTime = captureEndTime(line, endTime) - runtime = captureRuntime(line, runtime) - print file, dataset, parallelType, nWays, formatTime(startTime), formatTime(endTime), endToEnd(endTime, startTime), perMSites, runtime - -def endToEnd(endTime, startTime): - if endTime < startTime: - endTime = endTime + datetime.timedelta(1) - #print 
'endToEnd', endTime, startTime - return total_minutes(endTime - startTime) - -def formatTime(t): - return datetime.datetime.strftime(t, formatString) - -def total_minutes(td): - return td.days * 24 * 60 + td.seconds / 60.0 - -def captureLine(line, regex, func, prevValue): - match = regex.match(line) - if match != None: - if func != None: - val = func(line) - else: - val = match.group(1) - else: - val = None - #print 'Matching', line, regex, match, prevValue, val - - return val - -formatString = "%H:%M:%S" - -def captureStartTime(line, prev): - # todo - needs to find the earliest time - #INFO 11:03:50,202 HelpFormatter - The Genome Analysis Toolkit (GATK) v, Compiled - regex = re.compile("INFO\W*(\d+:\d+:\d+).*The Genome Analysis Toolkit.*") - return selectTime(captureLine(line, regex, None, prev), prev, earlier = True) - -def selectTime(newTimeString, oldTime, earlier = False): - def select(): - if newTimeString == None: - return oldTime - else: - newTime = datetime.datetime.strptime(newTimeString, formatString) - if oldTime == None: - return newTime - elif earlier: - if newTime < oldTime: - return newTime - else: - return oldTime - else: - if newTime > oldTime: - return newTime - else: - return oldTime - r = select() - #if not earlier: print 'selectTime', oldTime, newTimeString, r - return r - - -def captureEndTime(line, prev): - # todo - needs to find the latest time - regex = re.compile("INFO\W*(\d+:\d+:\d+).*GATKRunReport - Aggregating data for run report.*") - return selectTime(captureLine(line, regex, None, prev), prev, earlier=False) - -unitsToMinutes = { - 'm' : 1.0, - 'h' : 60, - 's' : 1.0/60, - 'd' : 60 * 60 - } - -def capturePerMSites(line, prev): - return captureDoneLine(line, prev, 8, 10) - -def captureRuntime(line, prev): - return captureDoneLine(line, prev, 6, 8) - -def captureDoneLine(line, prev, s, e): - # INFO 11:04:11,541 TraversalEngine - chr1:3769010 1.32e+05 20.0 s 2.5 m 1.5% 21.9 m 21.5 m - regex = re.compile("INFO .*TraversalEngine 
-.*done*") - val = captureLine(line, regex, lambda x: x.split()[s:e], None) - if val == None: - return prev - else: - x, u = val - return float(x) * unitsToMinutes[u] - - - -if __name__ == "__main__": - main() diff --git a/analysis/depristo/distributedGATK/distributedGATKPerformance.scala b/analysis/depristo/distributedGATK/distributedGATKPerformance.scala deleted file mode 100755 index 8ca54fbe7..000000000 --- a/analysis/depristo/distributedGATK/distributedGATKPerformance.scala +++ /dev/null @@ -1,200 +0,0 @@ -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.queue.QScript -import org.apache.commons.io.FilenameUtils; - -class DistributedGATKPerformance extends QScript { - qscript => - - @Argument(shortName="gatk", doc="gatk jar file", required=true) - var gatkJarFile: File = _ - - @Argument(shortName="outputDir", doc="output directory", required=false) - var outputDir: String = "" - - @Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false) - var datasets: List[String] = Nil - - @Argument(shortName="waysParallel", doc="selects the datasets to run. 
If not provided, all datasets will be used", required=false) - var waysParallelArg: List[Int] = Nil - - @Argument(shortName="long", doc="runs long calculations", required=false) - var long: Boolean = false - - @Argument(shortName="test", doc="runs long calculations", required=false) - var test: Boolean = false - - @Argument(shortName="limitTo30Min", doc="runs long calculations", required=false) - var limitTo30Min: Boolean = false - - @Argument(shortName="huge", doc="runs long calculations", required=false) - var huge: Int = -1 - - @Argument(shortName="justDist", doc="runs long calculations", required=false) - var justDist: Boolean = false - - @Argument(shortName="justSG", doc="runs long calculations", required=false) - var justSG: Boolean = false - - @Argument(shortName="trackerDir", doc="root directory for distributed tracker files", required=false) - var trackerDir: String = "" // "/humgen/gsa-scr1/depristo/tmp/" - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "DEBUG"; jarFile = gatkJarFile; memoryLimit = 2; } - - class Target( - val baseName: String, - val reference: File, - val dbsnpFile: String, - val hapmapFile: String, - val maskFile: String, - val bamList: File, - val goldStandard_VCF: File, - val intervals: String, - val titvTarget: Double, - val isLowpass: Boolean, - val useBAQ: Boolean) { - val name = qscript.outputDir + baseName - val clusterFile = new File(name + ".clusters") - def rawVCF(part: String) = new File(name + "." 
+ part + ".raw.vcf") - val filteredVCF = new File(name + ".filtered.vcf") - val titvRecalibratedVCF = new File(name + ".titv.recalibrated.vcf") - val tsRecalibratedVCF = new File(name + ".ts.recalibrated.vcf") - val goldStandardName = qscript.outputDir + "goldStandard/" + baseName - val goldStandardClusterFile = new File(goldStandardName + ".clusters") - } - - val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") - val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta") - val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - val dbSNP_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_hg18.rod" - val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_b36.rod" - val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf" - val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf" - val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf" - val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" - val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed" - val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed" - - // ToDos: - // reduce the scope of the datasets so the script is more nimble - // figure out how to give names to all the Queue-LSF logs (other than Q-1931@node1434-24.out) so that it is easier to find logs for certain steps - // create gold standard BAQ'd bam files, no reason to always do it on the fly - - // Analysis to add at the end of the script: - // auto generation of the cluster plots - // spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls - // produce 
Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF - - val lowPass: Boolean = true - - val targetDataSets: Map[String, Target] = Map( - "HiSeq" -> new Target("NA12878.HiSeq", hg18, dbSNP_hg18, hapmap_hg18, - "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask", - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), - new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/whole_genome_chunked.hg18.intervals", 2.07, !lowPass, true), - "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), - new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass, true), - "WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18, hapmap_hg18, - "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask", - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), - new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, !lowPass, true), - "TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new 
File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/Barcoded_1000G_WEx_Reduced_Plate_1.20.cleaned.list"), // BUGBUG: reduce from 60 to 20 people - new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass, true), - "LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36, - new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from - new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, lowPass,true), // chunked interval list to use with Queue's scatter/gather functionality - "LowPassAugust" -> new Target("ALL.august.v4", b37, dbSNP_b37, hapmap_b37, indelMask_b37, // BUGBUG: kill this, it is too large - new File("/humgen/1kg/processing/allPopulations_chr20_august_release.cleaned.merged.bams/ALL.cleaned.merged.list"), - new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass, true), - "LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"), - new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - 
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass,false), - "WExTrio" -> new Target("NA12878Trio.WEx", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bams.list"), - new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass, true) - ) - - def getTargetInterval(target: Target): List[String] = target.name match { - case "NA12878.HiSeq" => List("chr1") - case "FIN" => List("20") - case "ALL.august.v4" => List("20") - case "EUR.nov2010" => List("20") - case _ => List(target.intervals) - } - - def script = { - - // Selects the datasets in the -dataset argument and adds them to targets. - var targets: List[Target] = List() - if (!datasets.isEmpty) - for (ds <- datasets) - targets ::= targetDataSets(ds) // Could check if ds was mispelled, but this way an exception will be thrown, maybe it's better this way? - else // If -dataset is not specified, all datasets are used. - for (targetDS <- targetDataSets.valuesIterator) // for Scala 2.7 or older, use targetDataSets.values - targets ::= targetDS - - val nWays = if ( test ) List(32) else { if ( long ) List(1,2,4,8) else if ( huge != -1 ) List(huge) else List(16,32,64,128) } - //val nWays = List(2) - - for (target <- targets) { - for ( scatterP <- if ( test ) List(false) else if ( justSG ) List(true) else if ( justDist ) List(false) else List(true, false) ) - for (nWaysParallel <- nWays ) { - val aname = "ptype_%s.nways_%d".format(if ( scatterP ) "sg" else "dist", nWaysParallel) - - def addUG(ug: UnifiedGenotyper) = { - if ( ! 
long ) - ug.jobLimitSeconds = 60 * 60 * 4 - if ( limitTo30Min ) - ug.jobLimitSeconds = 60 * 30 - add(ug); - } - - // add scatter/gather or distributed parallelism - if ( scatterP ) { - var ug: UnifiedGenotyper = new UnifiedGenotyper(target, aname) - ug.scatterCount = nWaysParallel - ug.intervalsString ++= List(target.intervals) - addUG(ug) - } else { - for ( part <- 1 to nWaysParallel) { - var ug: UnifiedGenotyper = new UnifiedGenotyper(target, aname + ".part" + part) - ug.intervalsString ++= getTargetInterval(target) - ug.processingTracker = new File(trackerDir + target.name + "." + aname + ".distributed.txt") - ug.processingTrackerID = part - if ( part == 1 ) - ug.performanceLog = new File("%s.%s.pf.log".format(target.name, aname)) - ug.processingTrackerStatusFile = new File("%s.%s.%d.ptstatus.log".format(target.name, aname, part)) - addUG(ug) - } - } - - } - } - } - - // 1.) Call SNPs with UG - class UnifiedGenotyper(t: Target, aname: String) extends org.broadinstitute.sting.queue.extensions.gatk.UnifiedGenotyper with UNIVERSAL_GATK_ARGS { - this.reference_sequence = t.reference - this.dcov = if ( t.isLowpass ) { 50 } else { 250 } - this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } - this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } - this.input_file :+= t.bamList - this.out = t.rawVCF(aname) - this.baq = if (t.useBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} - this.analysisName = t.name + "_UG." 
+ aname - if (t.dbsnpFile.endsWith(".rod")) - this.DBSNP = new File(t.dbsnpFile) - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) - } -} diff --git a/analysis/depristo/distributedGATK/fileBackedGLPperformance.R b/analysis/depristo/distributedGATK/fileBackedGLPperformance.R deleted file mode 100644 index cf50b1df1..000000000 --- a/analysis/depristo/distributedGATK/fileBackedGLPperformance.R +++ /dev/null @@ -1,3 +0,0 @@ -#d <- read.table("../GATK/trunk/timer.dat", header=T) -require("lattice") -print(xyplot(elapsed.time + delta ~ cycle | name, data=d, scales=list(relation="free"), auto.key=T, type="b", outer=T)) \ No newline at end of file diff --git a/analysis/depristo/distributedGATK/getToTime.csh b/analysis/depristo/distributedGATK/getToTime.csh deleted file mode 100755 index bdfe447f5..000000000 --- a/analysis/depristo/distributedGATK/getToTime.csh +++ /dev/null @@ -1 +0,0 @@ -grep -l -e "ptype_sg" -e "part1\." short/Q-*.out long/Q-*.out > toTime.txt diff --git a/analysis/depristo/distributedGATK/makeChunks.csh b/analysis/depristo/distributedGATK/makeChunks.csh deleted file mode 100644 index 1493ce0d9..000000000 --- a/analysis/depristo/distributedGATK/makeChunks.csh +++ /dev/null @@ -1 +0,0 @@ -echo "63025520" | awk '{ for(i = 0; i < $1; i += 100000) {print "20:" i+1 "-" (i+100000 < $1 ? 
i+100000 : $1)}}' > whole_genome_chunked.chr20.hg19.intervals diff --git a/analysis/depristo/distributedGATK/model.R b/analysis/depristo/distributedGATK/model.R deleted file mode 100644 index 572edb596..000000000 --- a/analysis/depristo/distributedGATK/model.R +++ /dev/null @@ -1,34 +0,0 @@ -JOB_START_RATE = 0.1 # chance of starting is 0.1 -WORK_UNITS = 100 -WORK_RATE = 1 -N_TICKS = 300 - -ticks <- 1:N_TICKS - -# the probability that a job starts at exactly tick i -pThreadStartAtTick <- function(i) { - dexp(i, JOB_START_RATE) -} - -jobDoneByI <- function(i) { - return(sapply(i - ticks, function(x) max(x, 0)) * WORK_RATE) - #return(pCompleteAtI(i, pStarts, ticks)) -} - -pThreadDoneByI <- function(i) { - pStarts <- pThreadStartAtTick(ticks) - workDoneByThreadStartingAtI <- jobDoneByI(i) - fracDone <- workDoneByThreadStartingAtI / WORK_UNITS - doneAtI <- fracDone >= 1 - return(sum(pStarts * doneAtI)) -} - -pThreadsDoneByI <- function(i, nThreads) { - pDone <- rep(0, N_TICKS) - for ( thread : 1:nThreads ) - pDone <- pPrevThreadsNotDoneAtI(pDone, i) + pThreadDoneByI(i) -} - -#plot(ticks, workDoneByI(100)) -plot(ticks, sapply(ticks, function(i) pThreadDoneByI(i))) - diff --git a/analysis/depristo/distributedGATK/runqueue.csh b/analysis/depristo/distributedGATK/runqueue.csh deleted file mode 100755 index 78539cb81..000000000 --- a/analysis/depristo/distributedGATK/runqueue.csh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/tcsh - -setenv CMD "java -Djava.io.tmpdir=/broad/shptmp/depristo/tmp -jar /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/Queue.jar -statusTo depristo -S /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/analysis/depristo/distributedGATK/distributedGATKPerformance.scala -bsub --gatkjarfile /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar -dataset HiSeq $argv[2-$#argv]" - -if ( $1 == 1 ) then - pushd short; $CMD -jobQueue hour -run & -else if ( $1 == 2 ) then - pushd long; $CMD -jobQueue 
gsa -long -run & -else - $CMD -endif diff --git a/analysis/depristo/genotypeAccuracy/commands.R b/analysis/depristo/genotypeAccuracy/commands.R deleted file mode 100644 index 4c49c273d..000000000 --- a/analysis/depristo/genotypeAccuracy/commands.R +++ /dev/null @@ -1,40 +0,0 @@ -require("lattice") -require("ggplot2") -require("splines") - -ymax = xmax = 30 -HAVE_RAW_DATA = F -if ( HAVE_RAW_DATA ) { - inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table" - #inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/cgl.table.gz" - r <- digestTable(inputDataFile) - d = r$d - eByComp = r$eByComp - countsByTech = addEmpiricalPofG(ddply(d, .(ref, alt, technology, pGGivenDType, pGGivenD), genotypeCounts)) - print(qplot(pGGivenD, EmpiricalPofGQ, data=subset(countsByTech, technology=="HiSeq-paper" & pGGivenDType == "QofABGivenD"), facets = alt ~ ref, color=alt, geom=c("point"), group=alt, xlim=c(0,xmax), ylim=c(0,ymax)) - + geom_abline(slope=1, linetype=2)) - # + geom_smooth(se=T, size=1.5, aes(weight=Sum))) -} else { - eByComp = read.table("~/Dropbox/GSA members/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T) -} - -#print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD")) -#print(subset(eByComp, EmpiricalPofGQ < Inf)) - -goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf) - -print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2)) - -print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax)) - + geom_abline(slope=1, linetype=2) - + geom_smooth(se=F, aes(weight=Sum))) - -print(qplot(pGGivenD, pGGivenD - EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), 
group=rg, xlim=c(0,xmax), ylim=c(-10,10)) - + geom_abline(slope=0, linetype=2) - + geom_smooth(se=F, method=lm, formula = y ~ ns(x,1), aes(weight=Sum))) - -# By tech -print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ ., color=technology, geom=c("blank"), group=technology, xlim=c(0,xmax), ylim=c(0,ymax)) -+ geom_abline(slope=1, linetype=2) -+ geom_smooth(se=T, size=1.5, aes(weight=Sum))) - diff --git a/analysis/depristo/genotypeAccuracy/digestTable.R b/analysis/depristo/genotypeAccuracy/digestTable.R deleted file mode 100644 index 1278abc47..000000000 --- a/analysis/depristo/genotypeAccuracy/digestTable.R +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/env Rscript - -require("ggplot2") - -args <- commandArgs(TRUE) -verbose = TRUE - -inputDataFile = args[1] -onCmdLine = ! is.na(inputDataFile) - -addEmpiricalPofG <- function(d) { - r = c() - # - # TODO -- this is a really naive estimate of the accuracy, as it assumes the comp - # track is perfect. In reality the chip is at best Q30 accurate (replicate samples have - # level than this level of concordance). At low incoming confidence, we can effectively - # ignore this term but when the incoming Q is near or above Q30 this approximation clearly - # breaks down. 
- # - for ( i in 1:dim(d)[1] ) { - row = d[i,] - if ( row$pGGivenDType == "QofAAGivenD" ) v = row$HOM_REF - if ( row$pGGivenDType == "QofABGivenD" ) v = row$HET - if ( row$pGGivenDType == "QofBBGivenD" ) v = row$HOM_VAR - r = c(r, v / row$Sum) - } - - #print(length(r)) - d$EmpiricalPofG = r - d$EmpiricalPofGQ = round(-10*log10(1-r)) - return(d) -} - -genotypeCounts <- function(x) { - type = unique(x$variable)[1] - t = addmargins(table(x$comp)) - return(t) -} - - -digestTable <- function(inputDataFile) { - d = subset(read.table(inputDataFile, header=T), rg != "ALL") - d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent")) - d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G" - d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper" - d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent" - print(summary(d$technology)) - - eByComp = addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts)) - return(list(d=d, eByComp = eByComp)) - #countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts)) -} - -writeMyTable <- function(t, name) { - write.table(t,file=paste(inputDataFile, ".", name, ".tsv", sep="")) -} - -if ( onCmdLine ) { - r <- digestTable(inputDataFile) - writeMyTable(r$eByComp, "eByComp") -} - diff --git a/analysis/depristo/s3GATKReport/GATKPolicy.txt b/analysis/depristo/s3GATKReport/GATKPolicy.txt deleted file mode 100644 index d68b40765..000000000 --- a/analysis/depristo/s3GATKReport/GATKPolicy.txt +++ /dev/null @@ -1,12 +0,0 @@ -{ - "Statement": [ - { - "Sid": "Stmt1296439478068", - "Action": [ - "s3:PutObject" - ], - "Effect": "Allow", - "Resource": "arn:aws:s3:::GATK_Run_Reports/*" - } - ] -} diff --git a/analysis/depristo/s3GATKReport/GATK_cred.txt b/analysis/depristo/s3GATKReport/GATK_cred.txt deleted file mode 100644 index 3491d6072..000000000 --- a/analysis/depristo/s3GATKReport/GATK_cred.txt +++ /dev/null @@ -1,2 +0,0 @@ -AKIAJXU7VIHBPDW4TDSQ 
-uQLTduhK6Gy8mbOycpoZIxr8ZoVj1SQaglTWjpYA diff --git a/analysis/depristo/s3GATKReport/GroupPolicy.txt b/analysis/depristo/s3GATKReport/GroupPolicy.txt deleted file mode 100644 index eb6d3bdaa..000000000 --- a/analysis/depristo/s3GATKReport/GroupPolicy.txt +++ /dev/null @@ -1,8 +0,0 @@ -{ - "Statement":[{ - "Effect":"Allow", - "Action":"*", - "Resource":"*" - } - ] -} diff --git a/analysis/depristo/s3GATKReport/IGVPolicy.txt b/analysis/depristo/s3GATKReport/IGVPolicy.txt deleted file mode 100755 index 3fb797e7d..000000000 --- a/analysis/depristo/s3GATKReport/IGVPolicy.txt +++ /dev/null @@ -1,12 +0,0 @@ -{ - "Statement": [ - { - "Sid": "Stmt1296439478068", - "Action": [ - "s3:PutObject" - ], - "Effect": "Allow", - "Resource": "arn:aws:s3:::IGV_crowdsourcing/*" - } - ] -} diff --git a/analysis/depristo/s3GATKReport/IGV_cred.txt b/analysis/depristo/s3GATKReport/IGV_cred.txt deleted file mode 100644 index f073a3ced..000000000 --- a/analysis/depristo/s3GATKReport/IGV_cred.txt +++ /dev/null @@ -1,2 +0,0 @@ -AKIAIM64MSUYNQ2465HQ -D+l3HfPQFWia9HF8rKh/fJ5+yNYsltWUpj0C7L0Z diff --git a/analysis/depristo/s3GATKReport/setupS3IGVUser.csh b/analysis/depristo/s3GATKReport/setupS3IGVUser.csh deleted file mode 100755 index 5fd1459fe..000000000 --- a/analysis/depristo/s3GATKReport/setupS3IGVUser.csh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/tcsh - -# download CLI tools -# http://aws.amazon.com/developertools/AWS-Identity-and-Access-Management/4143 - -setenv JAVA_HOME /usr/ -setenv AWS_IAM_HOME ~/Downloads/IAMCli-1.1.0 -setenv PATH $AWS_IAM_HOME/bin:$PATH -setenv AWS_CREDENTIAL_FILE /Users/depristo/Desktop/broadLocal/GATK/trunk/account-key - -setenv CREATE_GROUPS false -setenv CREATE_IGV_USER false -setenv UPDATE_USER_KEYS false -setenv UPDATE_USER_POLICY true - -# Create the administrators group: -# we aren't actually using this, in fact -if ( $CREATE_GROUPS == true ) then -iam-groupcreate -g Admins -iam-grouplistbypath -iam-groupuploadpolicy -g Admins -p AdminsGroupPolicy -f 
GroupPolicy.txt -iam-grouplistpolicies -g Admins -endif - -# Create the IGV user -- uncomment if the IGV user needs to be created from scratch -# update the secret key -if $CREATE_IGV_USER == true then -iam-usercreate -u IGV -k -v > IGV_cred.txt -endif - -# the user access and secret keys are in the IGV source file IGVRunReport.java -# and must be updated to be the most current ones -if $UPDATE_USER_KEYS == true then -iam-userdelkey -u IGV -k $1 # $1 -> current access key -iam-useraddkey -u IGV > IGV_cred.txt -cat IGV_cred.txt -endif - -echo "IGV user policies" -if $UPDATE_USER_POLICY == true then -echo "Deleting policy" -iam-userdelpolicy -u IGV -p IGVRunReportUploading -iam-useruploadpolicy -u IGV -p IGVRunReportUploading -f IGVPolicy.txt -endif -iam-userlistpolicies -u IGV -v diff --git a/analysis/depristo/s3GATKReport/setupS3User.csh b/analysis/depristo/s3GATKReport/setupS3User.csh deleted file mode 100755 index 081ba6c28..000000000 --- a/analysis/depristo/s3GATKReport/setupS3User.csh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/tcsh - -# download CLI tools -# http://aws.amazon.com/developertools/AWS-Identity-and-Access-Management/4143 - -setenv JAVA_HOME /usr/ -setenv AWS_IAM_HOME ~/Downloads/IAMCli-1.1.0 -setenv PATH $AWS_IAM_HOME/bin:$PATH -setenv AWS_CREDENTIAL_FILE /Users/depristo/Desktop/broadLocal/GATK/trunk/account-key - -setenv CREATE_GROUPS false -setenv CREATE_GATK_USER false -setenv UPDATE_USER_KEYS false -setenv UPDATE_USER_POLICY true - -# Create the administrators group: -# we aren't actually using this, in fact -if ( $CREATE_GROUPS == true ) then -iam-groupcreate -g Admins -iam-grouplistbypath -iam-groupuploadpolicy -g Admins -p AdminsGroupPolicy -f GroupPolicy.txt -iam-grouplistpolicies -g Admins -endif - -# Create the GATK user -- uncomment if the GATK user needs to be created from scratch -# update the secret key -if $CREATE_GATK_USER == true then -iam-usercreate -u GATK -k -v > GATK_cred.txt -endif - -# the user access and secret keys are in the 
GATK source file GATKRunReport.java -# and must be updated to be the most current ones -if $UPDATE_USER_KEYS == true then -iam-userdelkey -u GATK -k $1 # $1 -> current access key -iam-useraddkey -u GATK > GATK_cred.txt -cat GATK_cred.txt -endif - -echo "GATK user policies" -if $UPDATE_USER_POLICY == true then -echo "Deleting policy" -iam-userdelpolicy -u GATK -p GATKRunReportUploading -iam-useruploadpolicy -u GATK -p GATKRunReportUploading -f GATKPolicy.txt -endif -iam-userlistpolicies -u GATK -v diff --git a/archive/R/plot_q_emp_stated_hst.R b/archive/R/plot_q_emp_stated_hst.R deleted file mode 100755 index f020f944d..000000000 --- a/archive/R/plot_q_emp_stated_hst.R +++ /dev/null @@ -1,41 +0,0 @@ -#!/broad/tools/apps/R-2.6.0/bin/Rscript - -args <- commandArgs(TRUE) - -input = args[1] - -t=read.table(input, header=T) -#t=read.csv(input) -#par(mfrow=c(2,1), cex=1.2) - -#outfile = paste(input, ".quality_emp_v_stated.png", sep="") -#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446) -outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") -pdf(outfile, height=7, width=7) -d.good <- t[t$nMismatches >= 1000,] -d.100 <- t[t$nMismatches < 100,] -d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,] -plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,63), ylim=c(0,63), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. 
empirical quality scores") -points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16) -points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16) -abline(0,1, lty=2) -dev.off() - -#outfile = paste(input, ".quality_emp_hist.png", sep="") -#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446) -outfile = paste(input, ".quality_emp_hist.pdf", sep="") -pdf(outfile, height=7, width=7) -hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0) -plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n") -axis(2,axTicks(2), format(axTicks(2), scientific=F)) -dev.off() - -# -# Plot Q reported histogram -# -outfile = paste(input, ".quality_rep_hist.pdf", sep="") -pdf(outfile, height=7, width=7) -hst=subset(data.frame(t$Qreported, t$nBases), t.nBases != 0) -plot(hst$t.Qreported, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Reported quality score histogram", xlab="Qreported quality score", ylab="Count", yaxt="n") -axis(2,axTicks(2), format(axTicks(2), scientific=F)) -dev.off() diff --git a/archive/R/plot_qual_diff_v_cycle.R b/archive/R/plot_qual_diff_v_cycle.R deleted file mode 100755 index a9d4f5e46..000000000 --- a/archive/R/plot_qual_diff_v_cycle.R +++ /dev/null @@ -1,20 +0,0 @@ -#!/broad/tools/apps/R-2.6.0/bin/Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] - -#X11(width=7, height=14) -#outfile = paste(input, ".qual_diff_v_cycle.png", sep="") -#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680) -outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -c <- read.table(input, header=T) -d.good <- c[c$nMismatches >= 100,] -d.100 <- c[c$nMismatches < 100,] -plot(d.good$Cycle, d.good$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", col="blue", ylim=c(-10, 10)) 
-points(d.100$Cycle, d.100$Qempirical_Qreported, type="p", col="lightblue", pch=3) -#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16) - diff --git a/archive/R/plot_qual_diff_v_dinuc.R b/archive/R/plot_qual_diff_v_dinuc.R deleted file mode 100755 index 03b70d494..000000000 --- a/archive/R/plot_qual_diff_v_dinuc.R +++ /dev/null @@ -1,16 +0,0 @@ -#!/broad/tools/apps/R-2.6.0/bin/Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] - -#outfile = paste(input, ".qual_diff_v_dinuc.png", sep="") -#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680) -outfile = paste(input, ".qual_diff_v_dinuc.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -#in_dinuc = paste(input, ".quality_difference_v_dinucleotide.csv", sep="") -#d <- read.csv(input) -d <- read.table(input, header=T) -plot(d$Dinuc, d$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Dinucleotide", ylim=c(-10,10)) diff --git a/archive/java/src/org/broadinstitute/sting/ACTransitionTable.java b/archive/java/src/org/broadinstitute/sting/ACTransitionTable.java deleted file mode 100755 index 59dfc7763..000000000 --- a/archive/java/src/org/broadinstitute/sting/ACTransitionTable.java +++ /dev/null @@ -1,273 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import 
org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Set; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Nov 22, 2010 - * Time: 12:22:08 PM - * To change this template use File | Settings | File Templates. - */ -@Analysis(name = "ACTransitionMatrix", description = "Number of additional genotypes from each new sample; random permutations") -public class ACTransitionTable extends VariantEvaluator { - private final int NUM_PERMUTATIONS = 50; - private final double LOW_GQ_PCT = 0.95; - private final double LOW_GQ_THRSH = 30.0; - private boolean initialized = false; - private long skipped = 0l; - - @DataPoint(name="Het transitions",description="AC[s] = AC[s-1]+1 and AC[s] = AC[s-1]+2 transitions") - TransitionTable transitions = null; - @DataPoint(name="Private permutations",description="Marginal increase in number of sites per sample") - PermutationCounts privatePermutations; - @DataPoint(name="AC2 Permutations",description="Marginal increase in number of AC=2 sites, per sample") - PermutationCounts doubletonPermutations; - @DataPoint(name="AC3 Permutations",description="Marginal increase in number of tripleton sites, per sample") - PermutationCounts tripletonPermutations; - - String[][] permutations; - - public boolean enabled() { - return true; - } - - public int getComparisonOrder() { - return 2; - } - - public String getName() { - return "ACTransitionTable"; - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( eval != null && ! initialized ) { - //this.veWalker.getLogger().warn("Initializing..."); - initialize(eval); - initialized = true; - } - - if ( isGood(eval) ) { - if ( comp != null && ! 
comp.isFiltered() ) { - return null; - } - - int order_offset = 0; - for ( String[] ordering : permutations ) { - int sample_offset = 0; - int variant_ac = 0; - for ( String sample : ordering ) { - if ( eval.getGenotype(sample).isHet() ) { - variant_ac++; - transitions.hetTransitionCounts[order_offset][variant_ac-1][sample_offset]++; - } else if ( eval.getGenotype(sample).isHomVar() ) { - variant_ac += 2; - transitions.homTransitionCounts[order_offset][variant_ac-1][sample_offset]++; - } else { - // todo -- note, unclear how to treat no calls. Is the hom in het,ref,ref,nocall,hom sample 4 or 5? - // todo -- do we want to tabulate P[sample i is not variant | some variant]? This is just combinatorics so i left it out - if ( variant_ac > 0 ) { - transitions.stationaryCounts[order_offset][variant_ac-1][sample_offset]++; - } - } - sample_offset ++; - } - order_offset++; - } - } else { - skipped++; - } - - return null; - } - - private boolean isGood(VariantContext vc) { - if ( vc == null || vc.isFiltered() || (vc.getHetCount() + vc.getHomVarCount() == 0) ) { // todo -- should be is variant, but need to ensure no alt alleles at ref sites - return false; - } else { - Collection gtypes = vc.getGenotypes().values(); - int ngood = 0; - for ( Genotype g : gtypes) { - if ( g.isCalled() && g.getPhredScaledQual() >= LOW_GQ_THRSH ) { - ngood ++; - } - } - - return ( (0.0+ngood)/(0.0+gtypes.size()) >= LOW_GQ_PCT ); - } - } - - public ACTransitionTable(VariantEvalWalker parent) { - //super(parent); - } - - public void initialize(VariantContext vc) { - Set permuteSamples = vc.getSampleNames(); - permutations = new String[NUM_PERMUTATIONS][permuteSamples.size()]; - //veWalker.getLogger().warn(String.format("Num samples: %d",permuteSamples.size())); - int offset = 0; - for ( String s : permuteSamples ) { - permutations[0][offset] = s; - offset ++; - } - - for ( int p = 1; p < NUM_PERMUTATIONS ; p++ ) { - permutations[p] = permutations[0].clone(); - for ( int o = 0; o < 
permutations[p].length; o ++ ) { - int r = (int) Math.floor(Math.random()*(o+1)); - String swap = permutations[p][r]; - permutations[p][r] = permutations[p][o]; - permutations[p][o] = swap; - } - } - - transitions = new TransitionTable(); - transitions.hetTransitionCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()*2][permuteSamples.size()]; - transitions.homTransitionCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()*2][permuteSamples.size()]; - transitions.stationaryCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()*2][permuteSamples.size()]; - privatePermutations = new PermutationCounts(1,transitions); - doubletonPermutations = new PermutationCounts(2,transitions); - tripletonPermutations = new PermutationCounts(3,transitions); - } - - public void finalizeEvaluation() { // note: data points are null when this is called (wtf?) - //veWalker.getLogger().info(String.format("Skipped: %d",skipped)); - } - - class TransitionTable implements TableType { - int[][][] hetTransitionCounts; - int[][][] homTransitionCounts; - int[][][] stationaryCounts; - String[][] countAverages; - String[] rowKeys = null; - String[] colKeys = null; - - public Object[] getRowKeys() { - if ( rowKeys == null ) { - rowKeys = new String[3*hetTransitionCounts[0].length]; - for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) { - rowKeys[i] = String.format("%s%d%s","AC_",i,"_(het)"); - } - for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) { - rowKeys[hetTransitionCounts[0].length+i] = String.format("%s%d%s","AC_",i,"_(hom)"); - } - for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) { - rowKeys[2*hetTransitionCounts[0].length+i] = String.format("%s%d%s","AC_",i,"_(ref)"); - } - } - - - return rowKeys; - } - - public String getCell(int x, int y) { - if ( countAverages == null ) { - countAverages = new String[hetTransitionCounts[0].length*3][hetTransitionCounts[0][0].length]; - for ( int sam = 0; sam < hetTransitionCounts[0][0].length; sam ++) { - for ( int 
idx = 0 ; idx < hetTransitionCounts[0].length; idx ++ ) { - int totalTimesAtACSample = 0; - int totalStationary = 0; - int totalAC1Shift = 0; - int totalAC2Shift = 0; - for ( int p = 0; p < hetTransitionCounts.length; p++ ) { - totalStationary += stationaryCounts[p][idx][sam]; - totalAC2Shift += (idx+2 >= hetTransitionCounts[0][0].length) ? 0 : homTransitionCounts[p][idx+2][sam]; - totalAC1Shift += (idx+1 >= hetTransitionCounts[0][0].length) ? 0 : hetTransitionCounts[p][idx+1][sam]; - } - totalTimesAtACSample = totalStationary+totalAC1Shift+totalAC2Shift; - countAverages[idx][sam] = formatProp(totalAC1Shift,totalTimesAtACSample); - countAverages[hetTransitionCounts[0].length+idx][sam] = formatProp(totalAC2Shift,totalTimesAtACSample); - countAverages[hetTransitionCounts[0].length*2+idx][sam] = formatProp(totalStationary,totalTimesAtACSample); - } - } - } - - return countAverages[x][y] == null ? "0.00" : countAverages[x][y]; - } - - private String formatProp(int num, int denom) { - return (denom != 0) ? 
String.format("%.4f", ((double) num)/denom) : "0.0"; - } - - public String getName() { return "AC Transition Tables"; } - - public Object[] getColumnKeys() { - if ( colKeys == null ) { - colKeys = new String[hetTransitionCounts[0][0].length]; - for ( int ac = 0; ac < hetTransitionCounts[0][0].length; ac ++ ) { - colKeys[ac] = String.format("Sample_%d",ac); - } - } - - return colKeys; - } - } - - - class PermutationCounts implements TableType { - int acToExtract; - TransitionTable table; - String[] rowNames; - String[] colNames; - - public PermutationCounts(int ac, TransitionTable tTable) { - acToExtract = ac; - table = tTable; - } - - public String[] getRowKeys() { - //System.out.printf("%s%n",table); - if ( rowNames == null ) { - rowNames = new String[table.stationaryCounts.length]; - for ( int p = 0 ; p < rowNames.length; p ++ ) { - rowNames[p] = String.format("Perm%d",p+1); - } - } - - return rowNames; - } - - public String[] getColumnKeys() { - if ( colNames == null ) { - colNames = new String[table.stationaryCounts[0][0].length]; - for ( int s = 0 ; s < colNames.length; s ++ ) { - colNames[s] = String.format("Sample%d",s+1); - } - } - - return colNames; - } - - public Integer getCell(int x, int y) { - return table.hetTransitionCounts[x][acToExtract-1][y] + - ( (acToExtract > table.homTransitionCounts[0][0].length) ? 
0 : table.homTransitionCounts[x][acToExtract-1][y]); - } - - public String getName() { - return String.format("PermutationCountsAC%d",acToExtract); - } - - public void init() { - getRowKeys(); - getColumnKeys(); - getCell(1,1); - } - } - - -} - diff --git a/archive/java/src/org/broadinstitute/sting/AlleleBalanceInspector.java b/archive/java/src/org/broadinstitute/sting/AlleleBalanceInspector.java deleted file mode 100755 index 33335e99d..000000000 --- a/archive/java/src/org/broadinstitute/sting/AlleleBalanceInspector.java +++ /dev/null @@ -1,85 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.diagnostics; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.RodGenotypeChipAsGFF; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.BaseUtils; - -/** - * Takes a BAM file and a Hapmap-chip file (via the -hc argument) and creates a table of reference allele - * percentage and alternate allele percentage for het, homvar, and other genotypes. 
- */ -public class AlleleBalanceInspector extends LocusWalker { - private int item = 1; - public void initialize() { - out.printf("item\tlocus\tref\tgenotype\tstate\tdepth\trefdepth\taltdepth\trefpct\taltpct%n"); - } - - public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - RodGenotypeChipAsGFF hc = tracker.lookup("child",RodGenotypeChipAsGFF.class); - - return hc != null && hc.getCalledGenotype().isVariant(ref.getBase()); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - RodGenotypeChipAsGFF hc = tracker.lookup("child",RodGenotypeChipAsGFF.class); - - String state; - if (hc.getCalledGenotype().isHet()) { - state = "het"; - } else if (hc.getCalledGenotype().isHom()) { - state = "homvar"; - } else { - state = "other"; - } - - int refIndex = ref.getBaseIndex(); - int altIndex = -1; - for (char base : hc.getCalledGenotype().getBases().toCharArray()) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - - if (baseIndex != refIndex) { - altIndex = baseIndex; - } - } - - int[] baseCounts = context.getPileup().getBaseCounts(); - double sum = (double) (baseCounts[refIndex] + baseCounts[altIndex]); - double refPct = ((double) baseCounts[refIndex])/sum; - double altPct = ((double) baseCounts[altIndex])/sum; - - out.printf("%d\t%s\t%c\t%s\t%s\t%d\t%d\t%d\t%f\t%f%n", - item++, - context.getLocation(), - ref.getBase(), - hc.getCalledGenotype().getBases(), - state, - context.getPileup().getReads().size(), - baseCounts[refIndex], - baseCounts[altIndex], refPct, altPct); - - return null; - } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - return null; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. 
- * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(Integer value, Integer sum) { - return null; //To change body of implemented methods use File | Settings | File Templates. - } -} diff --git a/archive/java/src/org/broadinstitute/sting/AlleleFrequencyComparison.java b/archive/java/src/org/broadinstitute/sting/AlleleFrequencyComparison.java deleted file mode 100755 index 2b4d60253..000000000 --- a/archive/java/src/org/broadinstitute/sting/AlleleFrequencyComparison.java +++ /dev/null @@ -1,212 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFConstants; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; -import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * - */ - -@Analysis(name = "Allele Frequency Comparison", description = "Compare allele frequency and counts between eval and comp") -public class AlleleFrequencyComparison extends VariantEvaluator { - private static int MAX_AC_COUNT = 100; // todo -- command line argument? 
- - @DataPoint(description="Counts of eval frequency versus comp frequency") - AFTable afTable = new AFTable(); - - @DataPoint(description="Counts of eval AC versus comp AC") - ACTable acTable = new ACTable(MAX_AC_COUNT); - - public boolean enabled() { return true; } - - public int getComparisonOrder() { return 2; } - - public String getName() { return "Allele Frequency Comparison"; } - - public AlleleFrequencyComparison(VariantEvalWalker parent) { - //super(parent); - } - - //public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ! (isValidVC(eval) && isValidVC(comp)) ) { - return null; - } else { - // todo -- this is a godawful hack. The "right way" isn't working, so do it the unsafe way for now. Note that - // todo -- this precludes getting the AC/AF values from the info field because some may not be there... - /*if ( missingField(eval) ) { - recalculateCounts(eval); - } - if ( missingField(comp) ) { - recalculateCounts(comp); - }*/ - HashMap evalCounts = new HashMap(2); - HashMap compCounts = new HashMap(2); - - VariantContextUtils.calculateChromosomeCounts(eval,evalCounts,false); - VariantContextUtils.calculateChromosomeCounts(comp,compCounts,false); - afTable.update(((List)evalCounts.get("AF")).get(0),((List)compCounts.get("AF")).get(0)); - acTable.update(((List)evalCounts.get("AC")).get(0),((List)compCounts.get("AC")).get(0)); - } - - return null; // there is nothing interesting - } - - private static boolean missingField(final VariantContext vc) { - return ! 
( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ); - } - - private void recalculateCounts(VariantContext vc) { - Map attributes = new HashMap(); - VariantContextUtils.calculateChromosomeCounts(vc,attributes,false); - vc = VariantContext.modifyAttributes(vc,attributes); - //getLogger().debug(String.format("%s %s | %s %s",attributes.get("AC"),attributes.get("AF"),vc.getAttribute("AC"),vc.getAttribute("AF"))); - if ( attributes.size() == 2 && missingField(vc) ) { - throw new org.broadinstitute.sting.utils.exceptions.StingException("VariantContext should have had attributes modified but did not"); - } - } - - private static boolean isValidVC(final VariantContext vc) { - return (vc != null && !vc.isFiltered() && vc.getAlternateAlleles().size() == 1); - } - - private static double getAF(VariantContext vc) { - Object af = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); - if ( af == null ) { - //throw new UserException("Variant context "+vc.getName()+" does not have allele frequency entry which is required for this walker"); - // still none after being re-computed; this is 0.00 - return 0.00; - } else if ( List.class.isAssignableFrom(af.getClass())) { - return ( (List) af ).get(0); - } else if ( String.class.isAssignableFrom(af.getClass())) { - // two possibilities - String s = (String) af; - try { - if ( s.startsWith("[") ) { - return Double.parseDouble(s.replace("\\[","").replace("\\]","")); - } else { - return Double.parseDouble(s); - } - } catch (NumberFormatException e) { - throw new UserException("Allele frequency field may be improperly formatted, found AF="+s,e); - } - } else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) { - return (Double) af; - } else { - throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",af.toString(),af.getClass())); - } - } - - private static int 
getAC(VariantContext vc) { - Object ac = vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); - if ( ac == null ) { - // still none after being re computed; this is 0 - return 0; - } else if ( List.class.isAssignableFrom(ac.getClass())) { - return ( (List) ac ).get(0); - } else if ( String.class.isAssignableFrom(ac.getClass())) { - // two possibilities - String s = (String) ac; - try { - if ( s.startsWith("[") ) { - return Integer.parseInt(s.replace("\\[","").replace("\\]","")); - } else { - return Integer.parseInt(s); - } - } catch (NumberFormatException e) { - throw new UserException(String.format("Allele count field may be improperly formatted, found AC=%s for record %s:%d",ac,vc.getChr(),vc.getStart()),e); - } - } else if ( Integer.class.isAssignableFrom(ac.getClass())) { - return (Integer) ac; - } else { - throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",ac.toString(),ac.getClass())); - } - } -} - -class AFTable implements TableType { - - protected int[][] afCounts = new int[101][101]; - - public Object[] getRowKeys() { - String[] afKeys = new String[101]; - for ( int f = 0; f < 101; f ++ ) { - afKeys[f] = String.format("%.2f",(f+0.0)/100.0); - } - - return afKeys; - } - - public Object[] getColumnKeys() { - return getRowKeys(); // nice thing about symmetric tables - } - - public Object getCell(int i, int j) { - return afCounts[i][j]; - } - - public String getName() { - return "Allele Frequency Concordance"; - } - - public void update(double eval, double comp) { - afCounts[af2index(eval)][af2index(comp)]++; - } - - private int af2index(double d) { - return (int) Math.round(100*d); - } -} - -class ACTable implements TableType { - protected int[][] acCounts; - protected int maxAC; - - public ACTable(int acMaximum) { - maxAC = acMaximum; - acCounts = new int[acMaximum+1][acMaximum+1]; - } - - public Object[] getRowKeys() { - String[] acKeys = new String[maxAC+1]; - for ( int i = 0 ; i <= maxAC ; i 
++ ) { - acKeys[i] = String.format("%d",i); - } - - return acKeys; - } - - public Object[] getColumnKeys() { - return getRowKeys(); - } - - public Object getCell(int i, int j) { - return acCounts[i][j]; - } - - public String getName() { - return "Allele Counts Concordance"; - } - - public void update(int eval, int comp) { - eval = eval > maxAC ? maxAC : eval; - comp = comp > maxAC ? maxAC : comp; - - acCounts[eval][comp]++; - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/AminoAcidTransition.java b/archive/java/src/org/broadinstitute/sting/AminoAcidTransition.java deleted file mode 100755 index d6997bda3..000000000 --- a/archive/java/src/org/broadinstitute/sting/AminoAcidTransition.java +++ /dev/null @@ -1,219 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; -import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; -import org.broadinstitute.sting.utils.analysis.AminoAcid; -import org.broadinstitute.sting.utils.analysis.AminoAcidTable; -import org.broadinstitute.sting.utils.analysis.AminoAcidUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to 
deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author chartl - * @since June 28, 2010 - */ - -@Analysis(name = "Amino Acid Transition", description = "Calculates the Transition Matrix for coding variants; entries are Total, Num. Ti, Num. 
Tv, Ratio") -public class AminoAcidTransition extends VariantEvaluator { - - //////////////////////////////////////////////////////////// - //// INTERNAL DATA POINT CLASSES - //////////////////////////////////////////////////////////// - - // a mapping from amino acid transition score histogram bin to Ti/Tv ratio - @DataPoint(description = "TiTv counts by amino acid change") - AminoAcidTiTvTable acidTable = null; - - class TiTvCount { - public int ti; - public int tv; - - public TiTvCount() { - ti = 0; - tv = 0; - } - - public int getTotal() { - return ti + tv; - } - - public double getRatio() { - return ( (double) ti )/(1.0+tv); - } - - public String toString() { - return String.format("%d:%d:%d:%.2f",getTotal(),ti,tv,getRatio()); - } - } - - class AminoAcidTiTvTable implements TableType { - - private TiTvCount[][] countsByAAChange; - - public AminoAcidTiTvTable() { - countsByAAChange = new TiTvCount[AminoAcid.values().length][AminoAcid.values().length]; - for ( int i = 0; i < AminoAcid.values().length; i ++ ) { - for ( int j = 0; j < AminoAcid.values().length; j++ ) { - countsByAAChange[i][j] = new TiTvCount(); - } - } - } - - public Object[] getRowKeys() { - return AminoAcidUtils.getAminoAcidCodes(); - - } - - public Object[] getColumnKeys() { - return AminoAcidUtils.getAminoAcidCodes(); - } - - public TiTvCount getCell(int x, int y) { - return countsByAAChange[x][y]; - } - - public String getName() { - return "AminoAcidTransitionTable"; - } - - public void update(AminoAcid reference, AminoAcid alternate, boolean isTransition) { - TiTvCount counter = countsByAAChange[reference.ordinal()][alternate.ordinal()]; - if ( isTransition ) { - counter.ti++; - } else { - counter.tv++; - } - } - } - - //////////////////////////////////////////////////////////// - //// CORE VARIANT EVALUATOR DATA AND METHODS - //////////////////////////////////////////////////////////// - - private String infoKey; - private String infoValueSplit; - private boolean useCodons; - private 
boolean enabled; - private AminoAcidTable lookup; - - public AminoAcidTransition(VariantEvalWalker parent) { - //super(parent); - //enabled = parent.aminoAcidTransitionKey != null; - enabled = true; - if ( enabled ) { - getParsingInformation(parent); - lookup = new AminoAcidTable(); - acidTable = new AminoAcidTiTvTable(); - } - } - - private void getParsingInformation(VariantEvalWalker parent) { - if ( enabled() ) { -// infoKey = parent.aminoAcidTransitionKey; -// infoValueSplit = parent.aminoAcidTransitionSplit; -// useCodons = parent.aatUseCodons; - - infoKey = null; - infoValueSplit = null; - useCodons = false; - - if ( infoKey == null ) { - throw new UserException.CommandLineException("No info-field key provided for amino acid tabulation. Please provide the appropriate key with -aatk."); - } - - if ( infoValueSplit == null ) { - throw new UserException.CommandLineException("No split string provided for amino acid tabulation. Please provide the split string with -aats"); - } - } - } - - public String getName() { - return "AminoAcidTransitionTable"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return enabled; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - String interesting = null; - //if ( eval != null && eval.hasAttribute(infoKey) ) { - if ( enabled && eval != null && eval.hasAttribute(infoKey) ) { - String[] parsedNames = ( (String) eval.getAttribute(infoKey)).split(infoValueSplit); - String first = "none"; - String second = "none"; - try { - first = parsedNames [0]; - second = parsedNames [1]; - } catch (ArrayIndexOutOfBoundsException e) { - //getLogger().warn("Error parsing variant context with value "+eval.getAttribute(infoKey)); - } - AminoAcid reference; - AminoAcid alternate; - if ( useCodons ) { - reference = 
lookup.getEukaryoticAA(first); - alternate = lookup.getEukaryoticAA(second); - } else { - reference = lookup.getAminoAcidByCode(first); - alternate = lookup.getAminoAcidByCode(second); - } - - //veWalker.getLogger().info(String.format("%s\t%s\t%s\t%s",first,second,reference,alternate)); - - if ( reference == null ) { - interesting = "Unknown Reference Codon"; - } else if ( alternate == null ) { - interesting = "Unknown Alternate Codon"; - } else { - acidTable.update(reference,alternate, VariantContextUtils.isTransition(eval)); - } - - } - - return interesting; // This module doesn't capture any interesting sites, so return null - } - - //public void finalizeEvaluation() { - // - //} -} diff --git a/archive/java/src/org/broadinstitute/sting/BaseTransitionTableCalculatorJavaWalker.java b/archive/java/src/org/broadinstitute/sting/BaseTransitionTableCalculatorJavaWalker.java deleted file mode 100644 index f2601a63c..000000000 --- a/archive/java/src/org/broadinstitute/sting/BaseTransitionTableCalculatorJavaWalker.java +++ /dev/null @@ -1,518 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.File; -import java.util.*; -import java.io.PrintStream; -import java.io.FileNotFoundException; - -import net.sf.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Oct 12, 2009 - * Time: 2:43:06 PM - * To change this template use File | Settings | File Templates. 
- */ -@By(DataSource.REFERENCE) -@Reference(window=@Window(start=-3,stop=3)) -public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker,Set> implements TreeReducible> { - @Output - PrintStream out; - - @Argument(fullName="usePreviousBases", doc="Use previous bases of the reference as part of the calculation, uses the specified number, defaults to 0", required=false) - int nPreviousBases = 0; - @Argument(fullName="useSecondaryBase",doc="Use the secondary base of a read as part of the calculation", required=false) - boolean useSecondaryBase = false; - @Argument(fullName="confidentRefThreshold",doc="Set the lod score that defines confidence in ref, defaults to 4", required=false) - int confidentRefThreshold = 5; - @Argument(fullName="maxNumMismatches",doc="Set the maximum number of mismatches at a locus before choosing not to use it in calculation. Defaults to 1.", required=false) - int maxNumMismatches = 1; - @Argument(fullName="minMappingQuality", doc ="Set the alignment quality below which to ignore reads; defaults to 30", required = false) - int minMappingQuality = 30; - @Argument(fullName="minQualityScore", doc = "Set the base quality score below which to ignore bases in the pileup, defaults to 20", required = false) - int minQualityScore = 20; - @Argument(fullName="usePileupMismatches", doc = "Use the number of mismatches in the pileup as a condition for the table", required=false) - boolean usePileupMismatches = false; - @Argument(fullName="usePreviousReadBases", doc="Use previous bases of the read as part of the calculation. Will ignore reads if there aren't this many previous bases. Uses the specified number. Defaults to 0", required=false) - int nPreviousReadBases = 0; - @Argument(fullName="useReadGroup", doc="Use the group number of the read as a condition of the table.", required = false) - boolean useReadGroup = false; - @Argument(fullName="outputFile", shortName="of", doc="Output to this file rather than standard out. 
Must be used with -nt.", required = false) - String outFilePath = null; - @Argument(fullName="forcePreviousReadBasesToMatchRef", doc="Forces previous read bases to match the reference", required = false) - boolean readBasesMustMatchRef = false; - - private UnifiedGenotyperEngine ug; - // private ReferenceContextWindow refWindow; - // private Set conditionalTables; - private List usePreviousBases; - private List previousBaseLoci; - - public void initialize() { - if ( nPreviousBases > 3 || ( nPreviousReadBases > 3 && readBasesMustMatchRef ) ) { - throw new UserException.CommandLineException("You have opted to use a number of previous bases in excess of 3. In order to do this you must change the reference window size in the walker itself."); - } - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.baseModel = BaseMismatchModel.THREE_STATE; - uac.ALL_BASES_MODE = true; - ug = new UnifiedGenotyperEngine(getToolkit(), uac); - // refWindow = new ReferenceContextWindow(nPreviousBases); - usePreviousBases = new ArrayList(); - previousBaseLoci = new ArrayList(); - - } - - public Set reduceInit() { - return new TreeSet(); - } - - public Set map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - ReadBackedPileup pileup = context.getBasePileup(); - Set newCounts = null; - //System.out.println(pileup.getBases()); - if ( baseIsUsable(tracker, ref, pileup, context) ) { - //System.out.println("Pileup will be used"); - if ( previousLociCanBeUsed(usePreviousBases,previousBaseLoci,context.getLocation()) ) { - for ( int r = 0; r < pileup.getReads().size(); r ++ ) { - if ( useRead ( pileup.getReads().get(r), pileup.getOffsets().get(r), ref ) ) { - newCounts = updateTables( newCounts, pileup.getReads().get(r), pileup.getOffsets().get(r), ref, pileup ); - } - } - } else { - updatePreviousBases(usePreviousBases,true,previousBaseLoci,context.getLocation() ); - } - } else { - updatePreviousBases( 
usePreviousBases,false,previousBaseLoci,context.getLocation() ); - } - - return newCounts; - } - - public Set reduce ( Set map, Set reduce ) { - if ( map != null && ! map.isEmpty() ) { - for ( BaseTransitionTable t : map ) { - boolean add = true; - for ( BaseTransitionTable r : reduce ) { - if ( r.conditionsMatch(t) ) { - r.incorporateTable(t); - add = false; - break; - } - } - if ( add ) { - reduce.add(t); - } - } - } - // System.out.println("Reduce: size of TransitionTable set is " + reduce.size() + " -- size of Map: " + (map != null ? map.size() : "null")); - return reduce; - } - - public Set treeReduce( Set reduce1, Set reduce2 ) { - // check to see if this is a truly tree-reducable calculation - if ( nPreviousBases >= 1 ) { - String errMsg = "Parallelization cannot be used with UsePreviousBases due to the fact that internal walker data specifies whether a previous reference base is usable or not."; - String errMsg2 = " This can cause cause concurrency issues and unpredictable behavior when used with parallelization. 
Either do not specify -nt, or try a the conjunction of "; - String errMsg3 = "--usePreviousReadBases and --forcePreviousReadBasesToMatchRef."; - throw new UserException.CommandLineException(errMsg+errMsg2+errMsg3); - } - return reduce(reduce1,reduce2); - } - - public void onTraversalDone( Set conditionalTables ) { - PrintStream output; - if ( outFilePath == null ) { - output = out; - } else { - try { - output = new PrintStream(outFilePath); - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotCreateOutputFile(new File(outFilePath), e); - } - } - output.print(createHeaderFromConditions()); - for ( BaseTransitionTable t : conditionalTables ) - t.print(output); - } - - public void updatePreviousBases(List usage, boolean canUse, List loci, GenomeLoc locus) { - // early return - if ( nPreviousBases < 1 ) { - return; - } - - if ( usage.size() <= nPreviousBases ) { - usage.add(canUse); - loci.add(locus); - } else { - usage.remove(0); - usage.add(canUse); - loci.remove(0); - loci.add(locus); - } - } - - public boolean previousLociCanBeUsed( List canUse, List loci, GenomeLoc locus ) { - if ( nPreviousBases < 1 ) { - return true; - } - - boolean use = true; - for ( boolean b : canUse ) { - use = use && b; - } - - if ( use ) { - use = use && ( loci.get(0).distance(locus) == 1 ); // truly is PREVIOUS base - } - - return use; - } - - public Set updateTables ( Set tables, SAMRecord read, int offset, ReferenceContext ref, ReadBackedPileup pileup ) { - List readConditions = buildConditions(read,offset,ref, pileup); - // System.out.println("Updating table with pileup: "+pileup.getBases()+ ( read.getReadNegativeStrandFlag() ? 
"-" : "+" ) + " Quality: "+read.getBaseQualities()[offset] + " MapQ: "+read.getMappingQuality()); - - if ( tables == null ) { - tables = new TreeSet(); - } - - boolean createNewTable = true; - - for ( BaseTransitionTable t : tables ) { - if ( t.conditionsMatch(readConditions) ) { - updateTable(t,read,offset,ref); - createNewTable = false; - break; - } - } - - if ( createNewTable ) { - BaseTransitionTable t = new BaseTransitionTable(readConditions); - updateTable(t,read,offset,ref); - tables.add(t); - } - - return tables; - } - - public void updateTable(BaseTransitionTable t, SAMRecord r, int o, ReferenceContext ref) { - // System.out.println("Update Table"); - if ( r.getReadNegativeStrandFlag() ) { - t.update((byte)BaseUtils.simpleComplement((char) r.getReadBases()[o]), (byte)BaseUtils.simpleComplement(ref.getBaseAsChar())); - } else { - t.update(r.getReadBases()[o], ref.getBase()); - } - } - - public boolean useRead( SAMRecord read, int offset, ReferenceContext ref ) { - - if ( Character.toUpperCase(read.getReadBases()[offset]) == Character.toUpperCase(ref.getBase()) ) { - return false; - } else if ( read.getMappingQuality() <= minMappingQuality ) { - return false; - } else if ( ! BaseUtils.isRegularBase( (char) read.getReadBases()[offset]) ) { - return false; - } else if ( read.getBaseQualities()[offset] <= minQualityScore ) { - return false; - } else if ( useSecondaryBase && read.getAttribute("SQ") == null ) { - return false; - } else if ( nPreviousBases >= 1 && previousReadBasesMismatchRef(read, offset, ref) ) { - return false; - } else if ( nPreviousReadBases >= 1 && readLacksPreviousBases(read,offset,nPreviousReadBases) ) { - return false; - } else if ( nPreviousReadBases >= 1 && readBasesMustMatchRef && previousReadBasesMismatchRef(read, offset, ref) ) { - return false; - } else { - return true; - } - } - - public boolean previousReadBasesMismatchRef( SAMRecord read, int offset, ReferenceContext ref ) { - int c = read.getReadNegativeStrandFlag() ? 
1 : -1; - if ( offset + nPreviousBases*c < 0 ) { - return true; - } else if ( offset + nPreviousBases*c > read.getReadLength() ) { - return true; - } - - for ( int prevBase = 1; prevBase <= nPreviousBases; prevBase ++ ) { - if ( Character.toUpperCase(read.getReadBases()[offset + prevBase*c]) != Character.toUpperCase(ref.getBases()[nPreviousBases+1+prevBase*c]) || ! BaseUtils.isRegularBase(ref.getBases()[nPreviousBases+1+prevBase*c])) { - return true; - } - } - - return false; - } - - public boolean readLacksPreviousBases( SAMRecord read, int offset, int prevBases ) { - if ( ! read.getReadNegativeStrandFlag() ) { - return offset - prevBases < 0; - } else { - return offset + prevBases + 1 >= read.getReadLength(); - } - } - - public List buildConditions( SAMRecord read, int offset, ReferenceContext ref, ReadBackedPileup pileup ) { - ArrayList conditions = new ArrayList(); - - if ( nPreviousBases > 0 ) { - conditions.add(buildRefString(ref,nPreviousBases, ! read.getReadNegativeStrandFlag())); - - } - - if ( useSecondaryBase ) { - conditions.add(getSecondaryBase(read,offset)); - } - - if ( nPreviousReadBases > 0 ) { - conditions.add(buildReadString(read, offset, nPreviousReadBases)); - } - - if ( usePileupMismatches ) { - conditions.add(countMismatches(ref.getBase(), pileup)); - } - - if ( useReadGroup ) { - conditions.add(read.getReadGroup().getReadGroupId()); - } - - return conditions; - } - - public String buildRefString(ReferenceContext ref, int bases, boolean forwardRead) { - if ( forwardRead ) { - return ( new String(ref.getBases()) ).substring(0,nPreviousBases-1); - } else { - return BaseUtils.simpleReverseComplement( ( new String(ref.getBases()) ).substring(nPreviousBases+1) ); - } - } - - public String buildReadString( SAMRecord read, int offset, int nPreviousReadBases ) { - if ( ! 
read.getReadNegativeStrandFlag() ) { - return read.getReadString().substring(offset-nPreviousReadBases,offset); - } else { - return BaseUtils.simpleReverseComplement( read.getReadString().substring(offset+1,offset+nPreviousReadBases+1) ); - } - } - - public String createHeaderFromConditions() { - String header = "Observed_base\tTrue_base"; - - if ( nPreviousBases > 0) { - header = header+"\tPrevious_"+nPreviousBases+"_bases"; - } - - if ( useSecondaryBase ) { - header = header + "\tSecondary_base"; - } - - if ( nPreviousReadBases > 0 ) { - header = header + "\tPrevious_"+nPreviousReadBases+"_read_bases"; - } - - if ( usePileupMismatches ) { - header = header + "\tNumber_of_pileup_mismatches"; - } - - if ( useReadGroup ) { - header = header + "\tRead_group"; - } - - return String.format("%s\t%s%n",header,"Counts"); - } - - public int countMismatches(byte ref, ReadBackedPileup p) { - int refM = p.getBaseCounts()[BaseUtils.simpleBaseToBaseIndex(ref)]; - return p.size()-refM; - } - - public char getSecondaryBase ( SAMRecord read, int offset ) { - return BaseUtils.baseIndexToSimpleBaseAsChar(QualityUtils.compressedQualityToBaseIndex( ( (byte[]) read.getAttribute("SQ") )[offset] ) ); - } - - public boolean baseIsUsable ( RefMetaDataTracker tracker, ReferenceContext ref, ReadBackedPileup pileup, AlignmentContext context ) { - return pileupContainsNoNs(pileup) && baseIsConfidentRef(tracker,ref,context) && pileupBelowMismatchThreshold(ref,pileup); - } - - public boolean pileupBelowMismatchThreshold( ReferenceContext ref, ReadBackedPileup pileup ) { - return countMismatches(ref.getBase(), pileup) <= maxNumMismatches; - } - - public boolean pileupContainsNoNs(ReadBackedPileup pileup) { - for ( byte c : pileup.getBases() ) { - if ( c == 'N' ) { - return false; - } - } - - return true; - } - - public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - if ( !BaseUtils.isRegularBase(ref.getBase()) ) - return false; - 
VariantCallContext calls = ug.calculateLikelihoodsAndGenotypes(tracker,ref,context); - if ( calls == null || calls.vc == null) - return false; - return ( calls.vc.getNSamples() > 0 && calls.vc.getGenotype(0).isHomRef() && calls.vc.getGenotype(0).getNegLog10PError() > confidentRefThreshold ); - - } - -} - - -class BaseTransitionTable implements Comparable { - - /* - * no direct manipulation of these objects ever - */ - private int[][] table; - private List conditions; - - public BaseTransitionTable(List conditions) { - table = new int[BaseUtils.BASES.length][BaseUtils.BASES.length]; - for ( int i = 0; i < BaseUtils.BASES.length; i ++ ) { - for ( int j = 0; j < BaseUtils.BASES.length; j ++ ) { - table[i][j]=0; - } - } - - this.conditions = conditions; - } - - public boolean conditionsMatch(Object obj) { - if ( obj == null ) { - return false; - } else if ( obj instanceof BaseTransitionTable ) { - return ((BaseTransitionTable) obj).conditionsMatch(conditions); - } else if ( ! (obj instanceof List) ) { - - return false; - } else if ( this.numConditions() != ((List)obj).size() ){ - return false; - } else { - boolean eq = true; - ListIterator thisIter = this.getConditionIterator(); - ListIterator thatIter = ((List)obj).listIterator(); - - while ( thisIter.hasNext() ) { - eq = eq && thisIter.next().equals(thatIter.next()); - } - - return eq; - } - } - - - public int compareTo(Object obj) { - if ( ! ( obj instanceof BaseTransitionTable ) ) { - return -1; - } else { - BaseTransitionTable t = (BaseTransitionTable) obj; - if ( this.conditionsMatch(t.conditions) ) { - return 0; - } else { - if ( this.numConditions() == t.numConditions() ) { - ListIterator thisIter = this.conditions.listIterator(); - ListIterator thatIter = t.conditions.listIterator(); - int g = 0; - do { - g = thisIter.next().compareTo(thatIter.next()); - } while ( g == 0 ); - - return g; - - } else { - return (this.numConditions() > t.numConditions() ) ? 
1 : -1; - } - } - } - - } - - public void print( PrintStream out ) { - StringBuilder s = new StringBuilder(); - for ( byte observedBase : BaseUtils.BASES ) { - for ( byte refBase : BaseUtils.BASES ) { - s.append(String.format("%s\t%s",(char)observedBase,(char)refBase)); - for ( Comparable c : conditions ) { - s.append(String.format("\t%s",c.toString())); - } - s.append(String.format("\t%d%n", table[BaseUtils.simpleBaseToBaseIndex(observedBase)][BaseUtils.simpleBaseToBaseIndex(refBase)])); - } - } - - out.print(s.toString()); - } - - public void update(byte observedBase, byte refBase ) { - //if ( observedBase == refBase ) { - // throw new StingException("BaseTransitionTable received equal observed and reference bases, which should not happen."); - //} - // System.out.println("Table updating: Observed Base: "+observedBase+" Ref base: "+refBase); - table[BaseUtils.simpleBaseToBaseIndex(observedBase)][BaseUtils.simpleBaseToBaseIndex(refBase)]++; - } - - public int numConditions() { - return conditions.size(); - } - - private Comparable getCondition(int offset) { - return conditions.get(offset); - } - - private ListIterator getConditionIterator() { - return conditions.listIterator(); - } - - public void incorporateTable(BaseTransitionTable t) { - for ( int i = 0; i < BaseUtils.BASES.length; i ++ ) { - for ( int j = 0; j < BaseUtils.BASES.length; j ++ ) { - table[i][j] += t.observationsOf(i,j); - } - } - } - - public int observationsOf( int observedBaseIndex, int referenceBaseIndex ) { - return table[observedBaseIndex][referenceBaseIndex]; - } - -} \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/ConcordanceTruthTable.java b/archive/java/src/org/broadinstitute/sting/ConcordanceTruthTable.java deleted file mode 100755 index 8f1b10598..000000000 --- a/archive/java/src/org/broadinstitute/sting/ConcordanceTruthTable.java +++ /dev/null @@ -1,345 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - - -import 
org.broadinstitute.sting.utils.genotype.Genotype; -import org.broadinstitute.sting.utils.genotype.Variation; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.Utils; - -import java.util.*; - -/** - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - *

- * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -public class ConcordanceTruthTable { - public static final int TRUE_POSITIVE = 0; - public static final int TRUE_NEGATIVE = 1; - public static final int FALSE_POSITIVE = 2; - public static final int FALSE_NEGATIVE = 3; - public static final int VARIANT = 1; - private static final String[] POOL_HEADERS = {"TP","TN","FP","FN"}; - - public static final int REF = 0; - public static final int VAR_HET = 1; - public static final int VAR_HOM = 2; - public static final int UNKNOWN = 3; - public static final int NO_CALL = 3; // synonym - private static final String[] TRUTH_NAMES = {"IS_REF", "IS_VAR_HET", "IS_VAR_HOM", "UNKNOWN"}; - private static final String[] CALL_NAMES = {"CALLED_REF", "CALLED_VAR_HET", "CALLED_VAR_HOM", "NO_CALL"}; - - private String name = null; - private boolean singleSampleMode; - - private int[][] table; - private int[] truth_totals; - private int[] calls_totals; - - - public ConcordanceTruthTable(String name) { - // there's a specific sample associated with this truth table - this.name = name; - singleSampleMode = true; - - table = new int[4][4]; - truth_totals = new int[4]; - calls_totals = new int[4]; - for (int i = 0; i < 4; i++) { - truth_totals[i] = 0; - calls_totals[i] = 0; - for (int j = 0; j < 4; j++) - table[i][j] = 0; - } - } - - public ConcordanceTruthTable(int nSamples) { - // there's no specific sample associated with this truth table - singleSampleMode = false; - name = "pooled_concordance"; - truth_totals = new int[4]; - calls_totals = new int[4]; - for (int i = 0; i < 4; i++) { - truth_totals[i] = 0; - calls_totals[i] = 0; - } - - initializeFrequencyTable(nSamples); - } - - private void initializeFrequencyTable( int numChips ) { - // System.out.println("Frequency Table for Pooled Concordance initialized with number of chips = "+numChips); - table 
= new int[numChips*2][4]; - for (int i = 0; i < 4; i++) { - for ( int freq = 0; freq < 2*numChips; freq ++ ) { - table[freq][i] = 0; - } - } - - // System.out.println("Table Size: "+table.length+" by "+table[1].length); - } - - public String addEntry(List> chipEvals, Variation eval, char ref) { - String violation = null; - - // if the table represents a single sample, then we can calculate genotype stats - if ( singleSampleMode ) { - for ( Pair chipEval : chipEvals ) { - - Genotype chipG = chipEval.first; - Genotype evalG = chipEval.second; - - if (chipG == null && evalG == null) - continue; - - int truthType = getGenotype(chipG, ref); - int callType = getGenotype(evalG, ref); - - //System.out.printf("TEST: %d/%d %s vs. %s%n", truthIndex, callIndex, chip, eval); - if ( truthType == VARIANT && callType != VARIANT ) { - violation = String.format("False negative: ref=%c chip=%s call=%s", ref, chipG, evalG); - } else if ( truthType == REF && callType == VARIANT ) { - violation = String.format("False positive: chip=%s call=%s", chipG, evalG); - } - - addGenotypeEntry(truthType, callType); - } - } else { // if we cannot associate tables with individuals, then we are working in a pooled context - // first we need to expand our tables to include frequency information - Pair > poolVariant = getPooledAlleleFrequency(chipEvals, ref); - - int truthType = poolVariant.getFirst(); // convenience method; now to interpret - int callType = getCallIndex(eval,ref); - - int numTrueSupportingAlleles = poolVariant.getSecond().getFirst(); - if ( numTrueSupportingAlleles > 0 && truthType == VARIANT && callType != VARIANT ) { - violation = String.format("False negative: %s with %d alt alleles", chipEvals.get(0).getFirst(), numTrueSupportingAlleles); - } else if ( truthType == REF && callType == VARIANT ) { - violation = String.format("False positive: %s at hom-ref site", eval); - } - - addFrequencyEntry( truthType, callType, poolVariant.getSecond().getFirst() ); - - } - - // TODO -- 
implement me for pooled mode with frequency stats - // TODO -- You'll want to use eval and the chips from chipEvals (these are the first members of the pair) - // TODO -- You'll also need to declare (and initialize) the relevant data arrays for the data - // TODO -- Indexes like TRUE_POSITIVE are defined above for you - return violation; - } - - public Pair> getPooledAlleleFrequency( List> chips, char ref) { - // this is actually just a note that I wanted to appear in blue. This method explicitly uses - // the assumption that tri-allelic sites do not really exist, and that if they do the - // site will be marked as such by an 'N' in the reference, so we will not get to this point. - - int frequency = 0; - int nChips = 0; - if ( chips != null ) { - for ( Pair chip : chips ) { - Genotype c = chip.getFirst(); - if ( c != null ) { - nChips++; - if ( c.isVariant(ref) ) { - if ( c.isHet() ) { - frequency++; - } else { // c is hom - frequency += 2; - } - } - //System.out.printf(" Genotype %s at %c => %d%n", c, ref, frequency); - } - } - //System.out.printf("*** %d%n", frequency); - } - - int truthType = nChips > 0 ? ( frequency > 0 ? 
VARIANT : REF ) : NO_CALL; - return new Pair >(truthType, new Pair(frequency,nChips)); - } - - private void addFrequencyEntry( int truthIndex, int callIndex, int numTrueSupportingAlleles ) { - //System.out.printf(" %s %s %d%n", CALL_NAMES[truthIndex], CALL_NAMES[callIndex], numTrueSupportingAlleles); - calls_totals[callIndex]++; - truth_totals[truthIndex]++; - - if ( truthIndex == REF && ( callIndex == REF || callIndex == NO_CALL ) ) { - // true negative - table[numTrueSupportingAlleles][TRUE_NEGATIVE]++; - // sanity check - there should never be an entry in - // [*][TRUE_NEGATIVE] for * > 0 - } else if ( truthIndex == REF && callIndex == VARIANT ) { - // false positive - table[numTrueSupportingAlleles][FALSE_POSITIVE]++; - } else if ( truthIndex == VARIANT && (callIndex == NO_CALL || callIndex == REF) ) { - // false negative - table[numTrueSupportingAlleles][FALSE_NEGATIVE]++; - } else if ( truthIndex == VARIANT && callIndex == VARIANT ) { - // true positive - table[numTrueSupportingAlleles][TRUE_POSITIVE]++; - } else { - // something else is going on; wonky site or something. Don't do anything to the table. - } - } - - private static int getCallIndex(Variation eval, char ref) { - int index; - - if ( eval == null ) { - index = NO_CALL; - } else if ( ! 
eval.isSNP() ) { - index = REF; - } else { - index = VARIANT; - } - - return index; - } - - private static int getGenotype(Genotype g, char ref) { - int type; - - if ( g == null ) - type = NO_CALL; - else if ( !g.isVariant(ref) ) - type = REF; - else if ( g.isHet() ) - type = VAR_HET; - else - type = VAR_HOM; - - return type; - } - - private void addGenotypeEntry(int truthIndex, int callIndex) { - table[truthIndex][callIndex]++; - truth_totals[truthIndex]++; - calls_totals[callIndex]++; - } - - public void addAllStats(List s) { - if ( singleSampleMode ) - addGenotypeStats(s); - else - addFrequencyStats(s); - } - -// private void addFrequencyStats(List s) { -// -// // TODO -- implement me for pooled mode with frequency stats -// s.add(String.format("name %s",name)); -// s.add(String.format("TRUTH_ALLELE_FREQUENCY\tERROR_OR_TRUTH_TYPE\tTOTAL\tAS_PRCT_OF_TOTAL_CALLS\tAS_PRCT_OF_CALLS_AT_FREQUENCY")); -// -// for ( int af = 0; af < table.length; af ++ ) { -// for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) { -// StringBuffer sb = new StringBuffer(); -// sb.append(String.format("%f ", ((double) af)/ table.length)); -// sb.append(String.format("%s ",POOL_HEADERS[errorIndex])); -// sb.append(String.format("%d ", table[af][errorIndex])); -// sb.append(String.format("%s ", percentOfTotal(table,af,errorIndex))); -// sb.append(String.format("%s ", marginalPercent(table[af],errorIndex))); -// s.add(sb.toString()); -// } -// } -// -// } - - private void addFrequencyStats(List s) { - s.add(String.format("name %s",name)); - s.add("TRUTH_ALLELE_COUNT\tTRUTH_ALLELE_FREQ\tTOTAL\t" + Utils.join(" ", POOL_HEADERS)); - - for ( int af = 0; af < table.length; af ++ ) { - int sum = 0; - String counts = ""; - for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) { - int count = table[af][errorIndex]; - sum += count; - counts += String.format(" %6d", count); - } - s.add(String.format("%6d %.3f %6d%s", af, ((double)af)/ table.length, sum, counts)); - } - - } - - private void 
addGenotypeStats(List s) { - s.add(String.format("name %s", name)); - s.add(String.format("TRUTH_STATE\tCALLED_REF\tCALLED_VAR_HET\tCALLED_VAR_HOM\tNO_CALL\t\tTOTALS\tTRUE_GENOTYPE_CONCORDANCE\tGENOTYPE_SENSITIVITY")); - for (int i = 0; i < 4; i++) { - StringBuffer sb = new StringBuffer(); - sb.append(String.format("%15s ", TRUTH_NAMES[i])); - for (int j = 0; j < 4; j++) - sb.append(String.format("%9d ", table[i][j])); - sb.append(String.format("%9d ", truth_totals[i])); - if (i == VAR_HET || i == VAR_HOM) { - sb.append(String.format("\t%s\t\t", cellPercent(table[i][i], table[i][REF] + table[i][VAR_HET] + table[i][VAR_HOM]))); - sb.append(String.format("%s", cellPercent(truth_totals[i] - table[i][NO_CALL], truth_totals[i]))); - } else { - sb.append("\tN/A\t\t\tN/A"); - } - s.add(sb.toString()); - } - - addCalledGenotypeConcordance(s); - addOverallStats(s); - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - s.add(String.format("%s_%s_%s %d", TRUTH_NAMES[i], CALL_NAMES[j], "NO_SITES", table[i][j])); - s.add(String.format("%s_%s_%s %s", TRUTH_NAMES[i], CALL_NAMES[j], "PERCENT_OF_TRUTH", cellPercent(table[i][j], truth_totals[i]))); - s.add(String.format("%s_%s_%s %s", TRUTH_NAMES[i], CALL_NAMES[j], "PERCENT_OF_CALLS", cellPercent(table[i][j], calls_totals[j]))); - } - if (i == VAR_HET || i == VAR_HOM) { - s.add(String.format("%s_%s %s", TRUTH_NAMES[i], "TRUE_GENOTYPE_CONCORDANCE", cellPercent(table[i][i], table[i][REF] + table[i][VAR_HET] + table[i][VAR_HOM]))); - s.add(String.format("%s_%s %s", TRUTH_NAMES[i], "GENOTYPE_SENSITIVITY", cellPercent(truth_totals[i] - table[i][NO_CALL], truth_totals[i]))); - } - } - } - - private void addCalledGenotypeConcordance(List s) { - StringBuilder sb = new StringBuilder(); - sb.append("CALLED_GENOTYPE_CONCORDANCE\t"); - for (int i = 0; i < 4; i++) { - int nConcordantCallsI = table[i][i]; - String value = "N/A"; - if (i != UNKNOWN) - value = String.format("%s\t", cellPercent(nConcordantCallsI, calls_totals[i] - 
table[UNKNOWN][i])); - sb.append(value); - } - s.add(sb.toString()); - } - - // How many overall calls where made that aren't NO_CALLS or UNKNOWNS? - private int getNCalled() { - int n = 0; - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) - if (i != NO_CALL && j != NO_CALL) n += table[i][j]; - return n; - } - - private void addOverallStats(List s) { - int nConcordantRefCalls = table[REF][REF]; - int nConcordantHetCalls = table[VAR_HET][VAR_HET]; - int nConcordantVarHomCalls = table[VAR_HOM][VAR_HOM]; - int nVarCalls = table[VAR_HOM][VAR_HET] + table[VAR_HOM][VAR_HOM] + table[VAR_HET][VAR_HET] + table[VAR_HET][VAR_HOM]; - int nConcordantVarCalls = nConcordantHetCalls + nConcordantVarHomCalls; - int nConcordantCalls = nConcordantRefCalls + nConcordantVarCalls; - int nTrueVar = truth_totals[VAR_HET] + truth_totals[VAR_HOM]; - int nCalled = getNCalled(); - s.add(String.format("VARIANT_SENSITIVITY %s", cellPercent(nVarCalls, nTrueVar))); - s.add(String.format("VARIANT_CONCORDANCE %s", cellPercent(nConcordantVarCalls, nVarCalls))); - s.add(String.format("OVERALL_CONCORDANCE %s", cellPercent(nConcordantCalls, nCalled))); - } - - private static String cellPercent(int count, int total) { - StringBuffer sb = new StringBuffer(); - total = Math.max(total, 0); - sb.append(String.format("%.2f", (100.0 * count) / total)); - sb.append("%"); - return sb.toString(); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/DSBWalker.java b/archive/java/src/org/broadinstitute/sting/DSBWalker.java deleted file mode 100644 index 71c5c26dc..000000000 --- a/archive/java/src/org/broadinstitute/sting/DSBWalker.java +++ /dev/null @@ -1,167 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import net.sf.samtools.SAMRecord; - -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Dec 3, 2009 - * Time: 11:54:35 AM - * To change this template use File | Settings | File Templates. - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) - -public class DSBWalker extends LocusWalker { - @Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true) - int COV_CUTOFF = 0; - @Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false) - int MINLENGTH_CUTOFF = 0; - - private int MERGE_DIST = 300; // merge intervals that are closer than this distance from one another - - private long maxcov = 0; - private long maxz = 0; - private long mergedmaxcov = 0; - private long mergedmaxz = 0; - GenomeLoc mergedInterval = null; - GenomeLoc currentInterval = null; - - private long nIntervals = 0; - - private void emit(GenomeLoc l) { - if ( mergedInterval == null ) { - mergedInterval = l.clone(); - mergedmaxcov = maxcov; - mergedmaxz = maxz; - return; - } - - if ( mergedInterval.getContigIndex() != l.getContigIndex() ) { - long length = mergedInterval.getStop()-mergedInterval.getStart()+1; - if ( length >= MINLENGTH_CUTOFF ) { - out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval - nIntervals++; - } - mergedInterval = l.clone(); - mergedmaxcov = maxcov; - mergedmaxz = maxz; - return; - } - - // merged interval exists and new interval is on the same 
contig. Check if the new interval - // is close enough so we got to merge and keep waiting: - - if ( l.getStart() - mergedInterval.getStop() < MERGE_DIST ) { - mergedInterval = GenomeLocParser.setStop(mergedInterval,l.getStop()); - if ( maxcov > mergedmaxcov) mergedmaxcov = maxcov; - if ( maxz > mergedmaxz ) mergedmaxz = maxz; - return; - } - - // nope, new interval is far enough. Print old one and keep current one. - - long length = mergedInterval.getStop()-mergedInterval.getStart()+1; - if ( length >= MINLENGTH_CUTOFF ) { - out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval - nIntervals++; - } - mergedInterval = l.clone(); - mergedmaxcov = maxcov; - mergedmaxz = maxz; - - } - - public void onTraversalDone() { - if ( mergedInterval != null ) { - long length = mergedInterval.getStop()-mergedInterval.getStart()+1; - if ( length >= MINLENGTH_CUTOFF ) { - out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval - nIntervals++; - } - } - System.out.println(nIntervals+" intervals detected."); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - ReadBackedPileup pileup = context.getPileup(); - List reads = pileup.getReads(); - - int nZero = pileup.getNumberOfMappingQualityZeroReads(); - - int nonZCoverage = reads.size() - nZero; - - if ( nonZCoverage >= COV_CUTOFF ) { - - // if we were not inside an interval, start one: - if ( currentInterval == null ) { - maxcov = nonZCoverage; - maxz = nZero; - currentInterval = context.getLocation().clone(); -// System.out.println("Setting current to "+currentInterval); - return 0; - } - - // if we were inside an interval and we just jumped onto a new contig, get rid of the old interval - if ( currentInterval.compareContigs(context.getLocation()) != 0 ) { - // we just moved to a new contig - System.out.println("On contig "+context.getLocation().getContig()); - emit(currentInterval); - maxcov = 
nonZCoverage; - maxz = nZero; - currentInterval = context.getLocation().clone(); - return 0; - } - - // we are on the same contig, we are within the interval, so we need to extend the current interval: - currentInterval = GenomeLocParser.setStop(currentInterval,context.getLocation().getStop()); // still within the interval, adjust stop - //System.out.println("Extending current to "+currentInterval +" ("+context.getLocation()+", "+context.getLocation().getStop()+")"); - if ( nonZCoverage > maxcov ) maxcov = nonZCoverage; // adjust maxcov - if ( nZero > maxz ) maxz = nZero; // adjust maxz - } else { - // low coverage, if we were inside an interval, it stops now: - if ( currentInterval != null ) { - // System.out.println("Emitting current as "+currentInterval); - emit(currentInterval); - currentInterval = null; - maxcov = 0; - maxz = 0; - } - } - - return 0; - } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - return 0; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(Integer value, Integer sum) { - return sum+value; //To change body of implemented methods use File | Settings | File Templates. 
- } -} diff --git a/archive/java/src/org/broadinstitute/sting/DSBWalkerV2.java b/archive/java/src/org/broadinstitute/sting/DSBWalkerV2.java deleted file mode 100644 index cf7677ac6..000000000 --- a/archive/java/src/org/broadinstitute/sting/DSBWalkerV2.java +++ /dev/null @@ -1,360 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.CircularArray; -import org.broadinstitute.sting.utils.PrimitivePair; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import net.sf.samtools.SAMRecord; - -import java.util.List; -import java.util.Set; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Dec 12, 2009 - * Time: 2:25:44 PM - * To change this template use File | Settings | File Templates. 
- */ -@Requires({DataSource.READS, DataSource.REFERENCE}) - -public class DSBWalkerV2 extends LocusWalker { -// @Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true) -// int COV_CUTOFF = 0; -// @Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false) -// int MINLENGTH_CUTOFF = 0; - @Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true) - int WINDOW_SIZE = 100; - @Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true) - double ENRICHMENT_CUTOFF = 5.0; - @Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+ - "(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+ - "so that control*enrichmentCutoff is too low to be convincing)",required=true) - int MIN_SIGNAL = 10; - - private CircularArray signalWindow = null; - private CircularArray controlWindow = null; - private CircularArray signalStrandsWindow = null; - private CircularArray controlStrandsWindow = null; - - private PrimitivePair.Long totalSignalCoverage = new PrimitivePair.Long(); - private PrimitivePair.Long totalControlCoverage = new PrimitivePair.Long(); - private PrimitivePair.Long totalSignalFwdStrands = new PrimitivePair.Long(); - private PrimitivePair.Long totalControlFwdStrands = new PrimitivePair.Long(); - - private Set signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able - private Set controlReadGroups ; // to properly assign the reads coming from a merged stream - - private long windowStart = -1; - private long windowStop = -1; - private int curContig = -1; - private String curContigName = ""; - - // the following variables are for 
buffering and merging windows : - private long regionStart = -1; - private long lastWindowStart = -1; - private PrimitivePair.Int maxSignalReads = new PrimitivePair.Int(); - private PrimitivePair.Int minSignalReads = new PrimitivePair.Int(); - private PrimitivePair.Int maxControlReads = new PrimitivePair.Int(); - private PrimitivePair.Int minControlReads = new PrimitivePair.Int(); - private double minEnrichmentUnique; - private double maxEnrichmentUnique; - private double minEnrichmentNonUnique; - private double maxEnrichmentNonUnique; - private double minEnrichmentTotal; - private double maxEnrichmentTotal; - private double minUniqueSignalStrandBalance = 0.0; - private double maxUniqueSignalStrandBalance = 0.0; - private double minNonUniqueSignalStrandBalance = 0.0; - private double maxNonUniqueSignalStrandBalance = 0.0; - private double minUniqueControlStrandBalance = 0.0; - private double maxUniqueControlStrandBalance = 0.0; - private double minNonUniqueControlStrandBalance = 0.0; - private double maxNonUniqueControlStrandBalance = 0.0; - - @Override - public void initialize() { - int nSams = getToolkit().getArguments().samFiles.size(); - - if ( nSams != 2 ) { - out.println("ERROR: two input bam files (signal and backround control) must be specified"); - System.exit(1); - } - List> readGroupSets = getToolkit().getMergedReadGroupsByReaders(); - signalReadGroups = readGroupSets.get(0); -// System.out.println(signalReadGroups.size()+" read groups in signal"); - controlReadGroups = readGroupSets.get(1); -// System.out.println(controlReadGroups.size()+" read groups in control"); - signalWindow = new CircularArray(WINDOW_SIZE); - controlWindow = new CircularArray(WINDOW_SIZE); - signalStrandsWindow = new CircularArray(WINDOW_SIZE); - controlStrandsWindow = new CircularArray(WINDOW_SIZE); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - ReadBackedPileup pileup = context.getPileup(); - List reads = 
pileup.getReads(); - - // compute coverages at the current site: - PrimitivePair.Int signalCov = new PrimitivePair.Int(); - PrimitivePair.Int controlCov = new PrimitivePair.Int(); - PrimitivePair.Int signalFwdStrands = new PrimitivePair.Int(); - PrimitivePair.Int controlFwdStrands = new PrimitivePair.Int(); - - for ( SAMRecord r : reads ) { - if ( signalReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) { - if ( r.getMappingQuality() == 0 ) { - signalCov.second++; - if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.second++; - } - else { - signalCov.first++; - if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.first++; - } - } else { - if ( controlReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) { - if ( r.getMappingQuality() == 0 ) { - controlCov.second++; - if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.second++; - } - else { - controlCov.first++; - if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.first++; - } - } else { - throw new StingException("Read "+r+" belongs to unknown read group ("+r.getReadGroup()+")"); - } - } - } - - GenomeLoc loc = context.getLocation(); - - // if ( curContig != 0 ) System.out.println(loc+" "+signalCov.first+" "+signalCov.second+" "+controlCov.first+" "+controlCov.second); - - if ( loc.getContigIndex() != curContig || loc.getStart() >= windowStop+WINDOW_SIZE ) { - // we jumped to the next contig, or we are on the same contig but the current position is - // more than WINDOW_SIZE away from the current window's end (i.e. 
there's nothing to shift) - checkCurrentWindow(true); - - if ( loc.getContigIndex() != curContig ) { - System.out.println("on contig "+loc.getContig()); - } - curContig = loc.getContigIndex(); - curContigName = loc.getContig(); -// prevPos = loc.getStart(); - windowStart = loc.getStart(); - windowStop = windowStart + WINDOW_SIZE - 1; - signalWindow.clear(); - controlWindow.clear(); - totalSignalCoverage.assignFrom( signalCov ); - totalControlCoverage.assignFrom( controlCov ); - totalSignalFwdStrands.assignFrom( signalFwdStrands ); - totalControlFwdStrands.assignFrom( controlFwdStrands ); - signalWindow.set(0,signalCov); - controlWindow.set(0,controlCov); - signalStrandsWindow.set(0,signalFwdStrands); - controlStrandsWindow.set(0,controlFwdStrands); - return 1; - } - - // offset of the current position w.r.t. the start of the window: - int offset = (int)(loc.getStart() - windowStart); - - if ( offset >= WINDOW_SIZE ) { - // if we are here, the current position is outside of the current window, but not - // far enough so that we'd need to reinitialize the window from scratch (that was already checked above). - // Now we need to shift. - - // We are receiving covered positions in order, so we are guaranteed that everything prior to - // the current position was already counted; if some elements of the windows are still nulls, it means - // there was no coverage there - - int shift = offset - WINDOW_SIZE + 1; - - // scroll the window(s) base by base until the current position is inside the window. At each step - // we will check if the window meets the requirements and should be printed out. 
- for ( int i = 0 ; i < shift ; i++ ) { - - // we are going to shift; check if the window as it is now is worth printing - checkCurrentWindow(false); - - // discard coverage from the first element of the window (this element is about to be shifted out of scope) - if ( signalWindow.get(0) != null ) totalSignalCoverage.subtract(signalWindow.get(0)); - if ( signalStrandsWindow.get(0) != null ) totalSignalFwdStrands.subtract(signalStrandsWindow.get(0)); - - if ( controlWindow.get(0) != null ) totalControlCoverage.subtract(controlWindow.get(0)); - if ( controlStrandsWindow.get(0) != null ) totalControlFwdStrands.subtract(controlStrandsWindow.get(0)); - - // advnace window coordinates on the ref - windowStart++; - windowStop++; - - // shift the data in the window(s): - signalWindow.shiftData(1); - controlWindow.shiftData(1); - signalStrandsWindow.shiftData(1); - controlStrandsWindow.shiftData(1); - - offset--; // this is the new offset w.r.t. to the shifted window - } - - } - - // at this point, either the current position was inside the current window, or it was outside, - // but the window was already shifted - totalSignalCoverage.add(signalCov); - totalControlCoverage.add(controlCov); - totalSignalFwdStrands.add(signalFwdStrands); - totalControlFwdStrands.add(controlFwdStrands); - signalWindow.set(offset,signalCov); - controlWindow.set(offset,controlCov); - signalStrandsWindow.set(offset,signalFwdStrands); - controlStrandsWindow.set(offset,controlFwdStrands); - return 1; - } - - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - return 0; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. 
- */ - public Integer reduce(Integer value, Integer sum) { - return sum+value; //To change body of implemented methods use File | Settings | File Templates. - } - - @Override - public void onTraversalDone(Integer result) { - printRegion(); - super.onTraversalDone(result); - } - - /** Checks if the currently held window satisfies the conditions set up for significance, and invokes buffered printout if so. - * If the parameter is set to true, printout of previously held region is forced, and the buffer is reinitialized with - * the new window if it passes the cutoffs, or left empty. - * - */ - private void checkCurrentWindow(boolean force) { - if ( force ) printRegion(); - if ( signalWindow.get(0) == null && controlWindow.get(0) == null ) return; // do not emit windows that start from empty cell; we will get them later - if ( totalControlCoverage.first * ENRICHMENT_CUTOFF / 36.0 < MIN_SIGNAL ) { // control coverage zero or too low - if ( totalSignalCoverage.first /28.0 > MIN_SIGNAL ) emitWindow(false); // require at least MIN_SIGNAL coverage for signal - return; - } - - // if we have decent coverage in control, just check for required enrichment in the signal - if ( ((double)totalSignalCoverage.first/28.0) / (totalControlCoverage.first/36.0) > ENRICHMENT_CUTOFF ) emitWindow(false); - } - - /** This is actually a delayed print command: it buffers the successive windows set for printout, merges the windows that - * are close enough and prints only when a train of close-by windows has ended and next window received is far enough - */ - private void emitWindow(boolean force) { - - if ( regionStart == -1 ) { - resetBuffer(); - return; - } - - if ( force || windowStart > lastWindowStart + WINDOW_SIZE ) { - // new window is far enough from the region we were buffering: emit old region - - printRegion(); - resetBuffer(); - return; - } - - // current window is too close (overlapping) with a previous one: we need to merge - - lastWindowStart = windowStart; - 
maxSignalReads.first = Math.max(maxSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0)); - maxSignalReads.second = Math.max(maxSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0)); - minSignalReads.first = Math.min(minSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0)); - minSignalReads.second = Math.min(minSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0)); - maxControlReads.first = Math.max(maxControlReads.first,(int)Math.round(totalControlCoverage.first/36.0)); - maxControlReads.second = Math.max(maxControlReads.second,(int)Math.round(totalControlCoverage.second/36.0)); - minControlReads.first = Math.min(minControlReads.first,(int)Math.round(totalControlCoverage.first/36.0)); - minControlReads.second = Math.min(minControlReads.second,(int)Math.round(totalControlCoverage.second/36.0)); - maxEnrichmentUnique = Math.max(maxEnrichmentUnique,((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0)); - minEnrichmentUnique = Math.min(minEnrichmentUnique, ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0)); - maxEnrichmentNonUnique = Math.max(maxEnrichmentNonUnique,((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0)); - minEnrichmentNonUnique = Math.min( minEnrichmentNonUnique, ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0) ); - maxEnrichmentTotal = Math.max( maxEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/ - ((totalControlCoverage.first+ totalControlCoverage.second)/36.0) ); - minEnrichmentTotal = Math.min( minEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/ - ((totalControlCoverage.first+ totalControlCoverage.second)/36.0) ); - - - maxUniqueSignalStrandBalance = Math.max(maxUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first); - minUniqueSignalStrandBalance = 
Math.min(minUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first); - maxNonUniqueSignalStrandBalance = Math.max(maxNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second); - minNonUniqueSignalStrandBalance = Math.min(minNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second); - maxUniqueControlStrandBalance = Math.max(maxUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first); - minUniqueControlStrandBalance = Math.min(minUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first); - maxNonUniqueControlStrandBalance = Math.max(maxNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second); - minNonUniqueControlStrandBalance = Math.min(minNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second); - - - } - - private void resetBuffer() { - regionStart = windowStart; - lastWindowStart = windowStart; - maxSignalReads.first = (int)Math.round(totalSignalCoverage.first/28.0); - maxSignalReads.second = (int)Math.round(totalSignalCoverage.second/28.0); - minSignalReads.assignFrom(maxSignalReads); - maxControlReads.first = (int)Math.round(totalControlCoverage.first/36.0); - maxControlReads.second = (int)Math.round(totalControlCoverage.second/36.0); - minControlReads.assignFrom(maxControlReads); - minEnrichmentUnique = maxEnrichmentUnique = ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0); - minEnrichmentNonUnique = maxEnrichmentNonUnique = ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0); - minEnrichmentTotal = maxEnrichmentTotal = ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/ - ((totalControlCoverage.first+ totalControlCoverage.second)/36.0); - - minUniqueSignalStrandBalance = maxUniqueSignalStrandBalance = 
((double)totalSignalFwdStrands.first)/totalSignalCoverage.first; - minNonUniqueSignalStrandBalance = maxNonUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.second)/totalSignalCoverage.second; - minUniqueControlStrandBalance = maxUniqueControlStrandBalance = ((double)totalControlFwdStrands.first)/totalControlCoverage.first; - minNonUniqueControlStrandBalance = maxNonUniqueControlStrandBalance = ((double)totalControlFwdStrands.second)/totalControlCoverage.second; - } - - private void printRegion() { - if ( regionStart == -1 ) return; - out.print(curContigName+":"+regionStart+"-"+windowStop+"\t"+(windowStop-regionStart+1) +"\t"+ - minSignalReads.first+"-"+maxSignalReads.first+"\t"+ - minSignalReads.second+"-"+maxSignalReads.second+"\t"+ - minControlReads.first+"-"+maxControlReads.first+"\t"+ - minControlReads.second+"-"+maxControlReads.second+"\t"); - out.printf("%.2f-%.2f\t",minEnrichmentUnique,maxEnrichmentUnique); - out.printf("%.2f-%.2f\t",minEnrichmentNonUnique,maxEnrichmentNonUnique); - out.printf("%.2f-%.2f\t",minEnrichmentTotal,maxEnrichmentTotal); - out.printf("%.2f-%.2f\t",minUniqueSignalStrandBalance,maxUniqueSignalStrandBalance); - out.printf("%.2f-%.2f\t",minNonUniqueSignalStrandBalance,maxNonUniqueSignalStrandBalance); - out.printf("%.2f-%.2f\t",minUniqueControlStrandBalance,maxUniqueControlStrandBalance); - out.printf("%.2f-%.2f",minNonUniqueControlStrandBalance,maxNonUniqueControlStrandBalance); - - if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER"); - out.println(); - - regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty - } -} diff --git a/archive/java/src/org/broadinstitute/sting/DuplicateQualsWalker.java b/archive/java/src/org/broadinstitute/sting/DuplicateQualsWalker.java deleted file mode 100755 index f83e4e62e..000000000 --- a/archive/java/src/org/broadinstitute/sting/DuplicateQualsWalker.java +++ /dev/null @@ -1,244 +0,0 @@ -/* - * 
Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.duplicates.DupUtils; -import org.broadinstitute.sting.utils.duplicates.DuplicateComp; - -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - -class MismatchCounter { - long nObs = 0; - long nMismatches = 0; - - public void inc(long incNObs, long incNMismatches) { - nObs += incNObs; - nMismatches += incNMismatches; - } - - public void inc(boolean mismatchP) { - inc(1, mismatchP ? 1 : 0); - } - - - public double mismatchRate() { - return (double)nMismatches / nObs; - } - - public byte empiricalQualScore() { - return QualityUtils.probToQual(1 - mismatchRate(), 0); - } - - public String headerString() { - return "mismatchRate\tempiricalQ\tnObs\tnMismatches"; - } - - public String toString() { - return String.format("%.10f\t%d\t%d\t%6d", mismatchRate(), empiricalQualScore(), nObs, nMismatches); - } -} - -class QualityTracker { - final private int MAX_QUAL_SCORE = 100; - MismatchCounter[][] mismatchesByQ = new MismatchCounter[MAX_QUAL_SCORE][MAX_QUAL_SCORE]; - - public QualityTracker() { - for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) { - for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) { - mismatchesByQ[i][j] = new MismatchCounter(); - } - } - } - - public void inc(int b1Qi, int b2Qi, boolean mismatchP, boolean orderDependent) { - int b1Q = orderDependent ? b1Qi : Math.max(b1Qi, b2Qi); - int b2Q = orderDependent ? 
b2Qi : Math.min(b1Qi, b2Qi); - - if ( b1Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q); - if ( b2Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q); - - mismatchesByQ[b1Q][b2Q].inc(mismatchP); - } - - public void inc(DuplicateComp dc, boolean orderDependent) { - inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP(), orderDependent); - } - - public int probMismatchQ1Q2(int q1, int q2) { - double e1 = 1 - QualityUtils.qualToProb(q1); - double e2 = 1 - QualityUtils.qualToProb(q2); - double eMM = e1 * (1 - e2) + (1 - e1) * e2 - 1/3 * e1 * e2; - return QualityUtils.probToQual(1 - eMM, 0.0); - } - - public void printToStream(PrintStream out, boolean filterUnobserved) { - out.printf("Q1\tQ2\tQmin\t%s%n", mismatchesByQ[0][0].headerString()); - for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) { - for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) { - MismatchCounter mc = mismatchesByQ[i][j]; - //System.out.printf("MC = %s%n", mc); - if ( filterUnobserved && mc.nObs == 0 ) - continue; - out.printf("%d\t%d\t%d\t%s\t%n", i, j, probMismatchQ1Q2(i,j), mc.toString()); - } - } - } -} - -public class DuplicateQualsWalker extends DuplicateWalker, QualityTracker> { - @Argument(fullName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data") - public boolean FILTER_UNOBSERVED_QUALS = false; - - @Argument(fullName="maxPairwiseCompsPerDupSet", required=false, doc="Maximumize number of pairwise comparisons to perform among duplicate read sets") - public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET = 100; - - @Argument(fullName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities") - public boolean COMBINE_QUALS = false; - - @Argument(fullName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities") - public boolean COMBINE_ALL_DUPS = false; - - @Argument(fullName="orderDependent", required=false, doc="") - public 
boolean orderDependent = false; - - @Argument(fullName="compareToUniqueReads", required=false, doc="If true, then we will compare only to unique (i.e., non-duplicated molecules) at the same duplicate site") - public boolean compareToUniqueReads = false; - - @Argument(fullName="comparePairToSingleton", required=false, doc="If true, then we will compare a combined dup to a random other read in the duplicate set, not a combined pair itself") - public boolean comparePairToSingleton = false; - - final boolean DEBUG = false; - final private boolean ACTUALLY_DO_WORK = true; - - public void onTraversalDone(QualityTracker result) { - result.printToStream(out, FILTER_UNOBSERVED_QUALS); - } - - public QualityTracker reduceInit() { - return new QualityTracker(); - } - - public QualityTracker reduce(List dupComps, QualityTracker tracker) { - for ( DuplicateComp dc : dupComps ) { - tracker.inc(dc, orderDependent); - } - - return tracker; - } - - // Print out data for regression - public List map(GenomeLoc loc, AlignmentContext context, Set> readSets ) { - //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size())); - List pairwiseComps = new ArrayList(); - - // todo -- fixme -- the logic here is all wrong given new interface -// if ( ! 
ACTUALLY_DO_WORK ) -// return pairwiseComps; -// -// if ( COMBINE_QUALS ) { -// Pair combinedReads = DupUtils.combinedReadPair( duplicateReads ); -// if ( combinedReads != null ) { -// SAMRecord combined1 = combinedReads.first; -// SAMRecord combined2 = combinedReads.second; -// -// if ( comparePairToSingleton ) -// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads ); -// else -// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads ); -// } -// } else { -// int nComparisons = 0; -// for ( SAMRecord read1 : duplicateReads ) { -// for ( SAMRecord read2 : duplicateReads ) { -// if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) { -// // the hashcode insures we don't do A vs. B and B vs. A -// //System.out.printf("Comparing %s against %s%n", read1, read2); -// nComparisons++; -// pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads ); -// if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET ) -// break; -// } -// } -// } -// } - - return pairwiseComps; - } - - private List addPairwiseMatches(List comps, - SAMRecord read1, SAMRecord read2, - List uniqueReads ) { - if ( compareToUniqueReads ) { - // we want to compare to a read in the unique read set - if ( uniqueReads.size() > 0 ) { // there's actually something to compare to - SAMRecord uniqueRead = uniqueReads.get(0); // might as well get the first one - return pairwiseMatches(comps, read1, uniqueRead); - } else { - return comps; - } - } else { - // default, just do read1 vs. read2 - return pairwiseMatches(comps, read1, read2); - } - } - - /** - * Calculates the pairwise mismatches between reads read1 and read2 and adds the result to the comps list. 
- * Doesn't contain any logic deciding what to compare, just does read1 and read2 - * - * @param comps - * @param read1 - * @param read2 - * @return - */ - private List pairwiseMatches(List comps, SAMRecord read1, SAMRecord read2 ) { - byte[] read1Bases = read1.getReadBases(); - byte[] read1Quals = read1.getBaseQualities(); - byte[] read2Bases = read2.getReadBases(); - byte[] read2Quals = read2.getBaseQualities(); - - for ( int i = 0; i < read1Bases.length; i++) { - byte qual1 = read1Quals[i]; - byte qual2 = read2Quals[i]; - boolean mismatchP = ! BaseUtils.basesAreEqual(read1Bases[i], read2Bases[i]); - DuplicateComp dc = new DuplicateComp(qual1, qual2, mismatchP); - comps.add(dc); - } - - return comps; - } -} \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/HapmapPoolAllelicInfoWalker.java b/archive/java/src/org/broadinstitute/sting/HapmapPoolAllelicInfoWalker.java deleted file mode 100755 index 39ca5f95a..000000000 --- a/archive/java/src/org/broadinstitute/sting/HapmapPoolAllelicInfoWalker.java +++ /dev/null @@ -1,193 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.playground.gatk.walkers.poolseq.PowerBelowFrequencyWalker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.genotype.Genotype; -import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; -import org.broadinstitute.sting.utils.genotype.Variation; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - 
-import java.io.*; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Nov 12, 2009 - * Time: 12:31:58 PM - * To change this template use File | Settings | File Templates. - */ -public class HapmapPoolAllelicInfoWalker extends LocusWalker { - @Argument(fullName="outputFile", shortName="of", doc="File to write to", required=true) - public String outputFileString = null; - @Argument(fullName="numIndividualsInPool", shortName="ps",doc="Pool size",required = true) - public int poolSize = -1; - @Argument(fullName="sampleNames", shortName="samples", doc="Sample name bindings", required=true) - public String sampleNameFile = null; - @Argument(fullName="minCallQuality", shortName="q", doc="Ignore calls with below this quality, defaults to -1") - public double minCallQ = -1; - - private PrintWriter output; - private static double EPSILON = Math.pow(10,-4); - private String[] sampleNames = null; - private PowerBelowFrequencyWalker powerWalker = null; - private ConcordanceTruthTable ctt = null; - - public void initialize() { - sampleNames = generateNameTableFromFile(sampleNameFile); - powerWalker = new PowerBelowFrequencyWalker(); - powerWalker.initialize(); - powerWalker.setPoolSize(poolSize); - ctt = new ConcordanceTruthTable(poolSize); - } - - public PrintWriter reduceInit() { - try { - output = new PrintWriter(outputFileString); - } catch (FileNotFoundException e) { - throw new StingException("File "+outputFileString+" could not be opened.", e); - } - output.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Num_Chips","Depth","Power","Support","Called"); - //System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Depth","Power","Support","Called"); - return output; - } - - public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - GenomeLoc loc = 
context.getLocation(); - String chrom = loc.getContig(); - long pos = loc.getStart(); - char refBase = Character.toUpperCase(ref.getBase()); - List> chips = getChips(sampleNames, tracker); - Pair> alleleFreqInfo = ctt.getPooledAlleleFrequency(chips,refBase); - char alternate; - if ( alleleFreqInfo.first == ConcordanceTruthTable.VARIANT ) { - //System.out.println(refBase + " " + alleleFreqInfo.getFirst().getBases()); - alternate = getAlternateBase(chips,refBase); - - } else { - return null; // early return - } - int numVariantAllele = alleleFreqInfo.getSecond().getFirst(); - int numChipsObserved = alleleFreqInfo.getSecond().getSecond(); - int depth = context.size(); - double power = powerWalker.calculatePowerAtFrequency(context,numVariantAllele); - int called; - - Variation call = tracker.lookup("calls",Variation.class); - if ( call == null ) { - called = 0; - } else if ( call.isReference() || call.getNegLog10PError() < minCallQ-EPSILON ) { - called = 0; - } else { - called = 1; - } - - ReadBackedPileup p = context.getPileup(); - int support = p.getBaseCounts()[BaseUtils.simpleBaseToBaseIndex(alternate)]; - - // sanity check - if ( refBase == alternate ) { - if ( alleleFreqInfo.first == ConcordanceTruthTable.VARIANT ) { - ;//logger.warn("Called as a variant! 
Ref: "+ refBase +"Chip data: " + alleleFreqInfo.getFirst().getBases()); - } - } - - return String.format("%s\t%d\t%c\t%c\t%d\t%d\t%d\t%f\t%d\t%d",chrom,pos,refBase,alternate,numVariantAllele,numChipsObserved,depth,power,support,called); - - } - - public char getAlternateBase(List> chips, char ref) { - for ( Pair chip : chips ) { - Genotype g = chip.first; - char[] bases = g.getBases().toCharArray(); - if ( Character.toUpperCase(bases[0]) != ref ) - return bases[0]; - if ( Character.toUpperCase(bases[1]) != ref ) - return bases[1]; - } - return ref; - } - - public PrintWriter reduce(String s, PrintWriter p) { - if ( s == null ) { - // do nothing - return p; - } else { - //System.out.printf("%s%n",s); - output.printf("%s%n",s); - return p; - } - } - - public void onTraversalDone(PrintWriter p) { - output.close(); - } - - private List> getChips(String[] rodNames, RefMetaDataTracker tracker) { - List> chips = new ArrayList >(rodNames.length); - for ( String name : rodNames ) { - List rods = tracker.getReferenceMetaData(name); - Variation chip = (rods.size() == 0 ? null : (Variation)rods.get(0)); - if ( chip != null ) { - // chips must be Genotypes - if ( !(chip instanceof VariantBackedByGenotype) ) - throw new StingException("Failure: trying to analyze genotypes using non-genotype truth data"); - chips.add(new Pair(((VariantBackedByGenotype)chip).getCalledGenotype(),null)); - } - } - - return chips; - } - // private methods for reading in names from a file - - private String[] generateNameTableFromFile(String file) { - BufferedReader reader; - try { - reader = new BufferedReader(new FileReader(file)); - } catch( FileNotFoundException e) { - String errMsg = "Hapmap pool file at "+file+" was not found. 
Please check filepath."; - throw new StingException(errMsg, e); - } - - LinkedList nameList = new LinkedList(); - - while(continueReading(reader)) { - String line = readLine(reader); - nameList.add(line); - } - - return nameList.toArray(new String[nameList.size()]); - } - - private boolean continueReading(BufferedReader reader) { - boolean continueReading; - try { - continueReading = reader.ready(); - } catch(IOException e) { - continueReading = false; - } - return continueReading; - } - - private String readLine(BufferedReader reader) { - String line; - try { - line = reader.readLine(); - } catch( IOException e) { - String errMsg = "BufferedReader pointing to "+reader.toString()+" was declared ready but no line could be read from it."; - throw new StingException(errMsg,e); - } - return line; - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/annotations/QualityAdjustedSecondBaseLod.java b/archive/java/src/org/broadinstitute/sting/annotations/QualityAdjustedSecondBaseLod.java deleted file mode 100644 index 4bb323d99..000000000 --- a/archive/java/src/org/broadinstitute/sting/annotations/QualityAdjustedSecondBaseLod.java +++ /dev/null @@ -1,37 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broad.tribble.vcf.VCFHeaderLineType; -import org.broad.tribble.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; - -import java.util.Map; -import java.util.HashMap; -import java.util.List; -import java.util.Arrays; - -public class QualityAdjustedSecondBaseLod implements InfoFieldAnnotation, ExperimentalAnnotation { - private final String KEY_NAME = "Qual_Adjusted_2blod"; - private final double CHI_LOD_MAX = -1000.0; 
- private final SecondBaseSkew skewCalc = new SecondBaseSkew(); - private final double log10e = Math.log10(Math.E); - private final double log10half = Math.log10(1.0/2); - - public List getKeyNames() { return Arrays.asList(KEY_NAME); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(KEY_NAME, 1, VCFHeaderLineType.Float, "Adjusted residual quality based on second-base skew")); } - - public Map annotate( RefMetaDataTracker tracker, ReferenceContext ref, Map contexts, VariantContext vc) { - String chi = skewCalc.getAnnotation(ref, contexts, vc); - if ( chi == null ) - return null; - double chi_square = Double.valueOf(chi); - double chi_loglik = chi_square <= 0.0 ? 0.0 : Math.max(-(chi_square/2.0)*log10e + log10half,CHI_LOD_MAX); // cap it... - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%f", 10*(vc.getNegLog10PError() + chi_loglik))); - return map; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/annotations/SecondBaseSkew.java b/archive/java/src/org/broadinstitute/sting/annotations/SecondBaseSkew.java deleted file mode 100755 index fe1122a1e..000000000 --- a/archive/java/src/org/broadinstitute/sting/annotations/SecondBaseSkew.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broad.tribble.vcf.VCFHeaderLineType; -import org.broad.tribble.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; - -import java.util.Map; -import java.util.HashMap; -import java.util.List; -import java.util.Arrays; - - -public class SecondBaseSkew implements InfoFieldAnnotation, ExperimentalAnnotation { - private final static double epsilon = Math.pow(10.0,-12.0); - private final static String KEY_NAME = "2b_Chi"; - private final static double[] UNIFORM_ON_OFF_RATIO = {1.0/3.0, 2.0/3.0}; - private double[] proportionExpectations = UNIFORM_ON_OFF_RATIO; - - public List getKeyNames() { return Arrays.asList(KEY_NAME); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(KEY_NAME, 1, VCFHeaderLineType.Float, "Chi-square Secondary Base Skew")); } - - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map 
stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; - - String annotation = getAnnotation(ref, stratifiedContexts, vc); - if ( annotation == null ) - return null; - Map map = new HashMap(); - map.put(getKeyNames().get(0), annotation); - return map; - } - - public String getAnnotation(ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( !vc.isBiallelic() || !vc.isSNP() ) - return null; - - char alternate = vc.getAlternateAllele(0).toString().charAt(0); - - Pair depth = new Pair(0, 0); - for ( String sample : stratifiedContexts.keySet() ) { - //Pair sampleDepth = getSecondaryPileupNonrefCount(ref.getBase(),stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getPileup(), alternate); - Pair sampleDepth = getSecondaryPileupNonrefCount(ref.getBaseAsChar(), stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(), alternate); - depth.first += sampleDepth.first; - depth.second += sampleDepth.second; - } - - if ( depth.first == 0 ) - return null; - - double biasedProportion = (1.0 + depth.second) / (1.0 + depth.first); - double p_transformed = transform(biasedProportion, depth.first+1); - double expected_transformed = transform(proportionExpectations[0], depth.first+1); - double chi_square = Math.signum(biasedProportion - proportionExpectations[0])*Math.min(Math.pow(p_transformed - expected_transformed, 2), Double.MAX_VALUE); - return String.format("%f", chi_square); - } - - private double transform( double proportion, int depth ) { - proportion = proportion - epsilon; - return proportion / ( Math.sqrt ( proportion*(1-proportion)/depth ) ); - } - - private Pair getSecondaryPileupNonrefCount(char ref, ReadBackedPileup p, char snp ) { - int variantDepth = 0; - int variantsWithRefSecondBase = 0; - - for (PileupElement pile : p ) { - byte pbase = pile.getBase(); - byte sbase = pile.getSecondBase(); - - if ( 
BaseUtils.isRegularBase((char)sbase) && BaseUtils.basesAreEqual(pbase, (byte) snp) ) { - variantDepth++; - if ( BaseUtils.basesAreEqual(sbase, (byte)ref) ) { - variantsWithRefSecondBase++; - } - } - } - - return new Pair(variantDepth, variantsWithRefSecondBase); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/concordance/CallsetConcordanceWalker.java b/archive/java/src/org/broadinstitute/sting/concordance/CallsetConcordanceWalker.java deleted file mode 100755 index bd9210f22..000000000 --- a/archive/java/src/org/broadinstitute/sting/concordance/CallsetConcordanceWalker.java +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.concordance; - -import org.broad.tribble.vcf.VCFGenotypeRecord; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFHeaderLine; -import org.broad.tribble.vcf.VCFRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.classloader.PackageUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.utils.genotype.vcf.*; - -import java.io.File; -import java.util.*; - - -/** - * Determines the concordance between multiple VCF call sets at each position. - * Users can specify which concordance tests should be run. - */ -@Requires(value={DataSource.REFERENCE}) -@Reference(window=@Window(start=-20,stop=20)) -public class CallsetConcordanceWalker extends RodWalker { - @Argument(fullName="concordance_output", shortName="CO", doc="VCF file to which output should be written", required=true) - private File OUTPUT = null; - @Argument(fullName="concordanceType", shortName="CT", doc="Concordance subset types to apply to given callsets. 
Syntax: 'type[:key1=arg1,key2=arg2,...]'", required=false) - private String[] TYPES = null; - @Argument(fullName="list", shortName="ls", doc="List the available concordance types and exit", required=false) - private Boolean LIST_ONLY = false; - - - // the concordance tests to run - private ArrayList requestedTypes; - - // VCF writer for the output of the concordance tests - private VCFWriter vcfWriter; - - // a map of rod name to uniquified sample name - private HashMap, String> rodNamesToSampleNames = new HashMap, String>(); - - - /** - * Prepare the output file and the list of available features. - */ - public void initialize() { - - // get the possible concordance types - List> classes = PackageUtils.getClassesImplementingInterface(ConcordanceType.class); - - // print and exit if that's what was requested - if ( LIST_ONLY ) { - out.println("\nAvailable concordance types:"); - for (int i = 0; i < classes.size(); i++) - out.println("\t" + classes.get(i).getSimpleName()); - out.println(); - System.exit(0); - } - - // get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap) - HashSet samples = new HashSet(); - SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); - - for ( java.util.Map.Entry, String> entry : rodNamesToSampleNames.entrySet() ) { - logger.debug("Uniquified sample mapping: " + entry.getKey().first + "/" + entry.getKey().second + " -> " + entry.getValue()); - } - - // initialize requested concordance types - requestedTypes = new ArrayList(); - if (TYPES != null) { - for ( String requestedTypeString : TYPES ) { - String[] requestedPieces = requestedTypeString.split(":"); - String requestedType = requestedPieces[0]; - - boolean foundClass = false; - for ( Class type : classes ) { - - if (requestedType.equalsIgnoreCase(type.getSimpleName())) { - foundClass = true; - try { - ConcordanceType concordance = (ConcordanceType)type.newInstance(); - HashMap requestedArgs = 
new HashMap(); - if ( requestedPieces.length == 2 ) { - String[] argStrings = requestedPieces[1].split(","); - for (int i = 0; i < argStrings.length; i++ ) { - String[] arg = argStrings[i].split("="); - if ( arg.length == 2 ) - requestedArgs.put(arg[0], arg[1]); - } - } - - concordance.initialize(requestedArgs, samples); - requestedTypes.add(concordance); - break; - } catch (InstantiationException e) { - throw new StingException(String.format("Cannot instantiate concordance class '%s': must be concrete class", type.getSimpleName())); - } catch (IllegalAccessException e) { - throw new StingException(String.format("Cannot instantiate concordance class '%s': must have no-arg constructor", type.getSimpleName())); - } - } - } - - if ( !foundClass ) - throw new StingException("The requested concordance type (" + requestedType + ") isn't a valid concordance option"); - } - } - - // set up the header fields - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("source", "CallsetConcordance")); - hInfo.add(new VCFHeaderLine("note", "\"This file represents a concordance test of various call sets - NOT the output from a multi-sample caller\"")); - hInfo.addAll(getVCFAnnotationDescriptions(requestedTypes)); - - vcfWriter = new VCFWriter(OUTPUT); - vcfWriter.writeHeader(new VCFHeader(hInfo, samples)); - } - - public static Set getVCFAnnotationDescriptions(Collection types) { - - TreeSet descriptions = new TreeSet(); - for ( ConcordanceType type : types ) - descriptions.add(type.getInfoDescription()); - - return descriptions; - } - - public Integer map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) { - if ( rodData == null ) // RodWalkers can make funky map calls - return 0; - - // get all of the vcf rods at this locus - Map vcfRods = new LinkedHashMap(); - Iterator rods = rodData.getAllRods().iterator(); - while (rods.hasNext()) { - GATKFeature rod = rods.next(); - if ( 
rod.getUnderlyingObject() instanceof VCFRecord ) { - if (vcfRods.containsKey(rod)) throw new StingException("Duplicate VCF's found"); - vcfRods.put((VCFRecord)rod.getUnderlyingObject(),rod.getName()); - } - } - - if ( vcfRods.size() == 0 ) - return 0; - - // pull out all of the individual calls from the rods and insert into a map based on the - // mapping from rod/sample to uniquified name - HashMap samplesToRecords = new HashMap(); - for ( VCFRecord rod : vcfRods.keySet() ) { - List records = rod.getVCFGenotypeRecords(); - for ( VCFGenotypeRecord vcfRec : records ) { - String uniquifiedSample = rodNamesToSampleNames.get(new Pair(vcfRods.get(rod), vcfRec.getSampleName())); - if ( uniquifiedSample == null ) - throw new StingException("Unexpected sample encountered: " + vcfRec.getSampleName() + " in rod " + vcfRods.get(rod)); - - samplesToRecords.put(uniquifiedSample, vcfRec); - } - } - - // create a merged record from all input VCFs - VCFRecord record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames); - - // add in the info fields to the new record based on the results of each of the relevant concordance tests - for ( ConcordanceType type : requestedTypes ) { - String result = type.computeConcordance(samplesToRecords, ref); - if ( result != null ) { - record.addInfoField(type.getInfoName(), result); - } - } - - // emit the new record - vcfWriter.addRecord(record); - - return 1; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - vcfWriter.close(); - out.printf("Processed %d loci.\n", result); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/concordance/ConcordanceType.java b/archive/java/src/org/broadinstitute/sting/concordance/ConcordanceType.java deleted file mode 100755 index 124da2d46..000000000 --- a/archive/java/src/org/broadinstitute/sting/concordance/ConcordanceType.java +++ /dev/null @@ -1,16 +0,0 @@ -package 
org.broadinstitute.sting.gatk.walkers.concordance; - -import org.broad.tribble.vcf.VCFGenotypeRecord; -import org.broad.tribble.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; - -import java.util.Map; -import java.util.Set; - -public interface ConcordanceType { - - public void initialize(Map args, Set samples); - public String computeConcordance(Map samplesToRecords, ReferenceContext ref); - public String getInfoName(); - public VCFInfoHeaderLine getInfoDescription(); -} \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/concordance/SNPGenotypeConcordance.java b/archive/java/src/org/broadinstitute/sting/concordance/SNPGenotypeConcordance.java deleted file mode 100755 index fd8cdca8f..000000000 --- a/archive/java/src/org/broadinstitute/sting/concordance/SNPGenotypeConcordance.java +++ /dev/null @@ -1,121 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.concordance; - -import org.broad.tribble.vcf.VCFGenotypeRecord; -import org.broad.tribble.vcf.VCFHeaderLineType; -import org.broad.tribble.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.StingException; - -import java.util.*; - -/** - * Split up two call sets into their various concordance sets - */ -public class SNPGenotypeConcordance implements ConcordanceType { - - private double Qscore = 30.0; - - private String sample1, sample2; - - public SNPGenotypeConcordance() {} - - public void initialize(Map args, Set samples) { - if ( samples.size() != 2 ) - throw new StingException("SNPGenotype concordance test cannot handle anything other than 2 VCF records"); - - if ( args.get("qscore") != null ) - Qscore = Double.valueOf(args.get("qscore")); - - Iterator iter = samples.iterator(); - sample1 = iter.next(); - sample2 = iter.next(); - } - - public String computeConcordance(Map samplesToRecords, ReferenceContext ref) { - char refBase = ref.getBaseAsChar(); - - 
VCFGenotypeRecord call1 = samplesToRecords.get(sample1); - if ( call1 != null && call1.isNoCall() ) - call1 = null; - VCFGenotypeRecord call2 = samplesToRecords.get(sample2); - if ( call2 != null && call2.isNoCall() ) - call2 = null; - - if ( call1 == null || call2 == null ) { - if ( call1 != null && call1.isPointGenotype() && call1.isVariant(refBase) ) { - if ( 10.0 * call1.getNegLog10PError() >= Qscore ) - return "set1ConfidentSet2NoCall"; - else - return "set2NoCall"; - } - else if ( call2 != null && call2.isPointGenotype() && call2.isVariant(refBase) ) { - if (10.0 * call2.getNegLog10PError() >= Qscore ) - return "set1NoCallSet2Confident"; - else - return "set1NoCall"; - } - return null; - } - - // if either is an indel, skip this site - if ( !call1.isPointGenotype() || !call2.isPointGenotype() ) - return null; - - double confidence1 = 10.0 * call1.getNegLog10PError(); - double confidence2 = 10.0 * call2.getNegLog10PError(); - String genotype1 = call1.getBases(); - String genotype2 = call2.getBases(); - - // are they both SNPs? - boolean call1IsVariant = call1.isVariant(refBase); - boolean call2IsVariant = call2.isVariant(refBase); - if ( call1IsVariant && call2IsVariant ) { - - // are they confident calls? 
- boolean conf1 = confidence1 >= Qscore; - boolean conf2 = confidence2 >= Qscore; - boolean confCombo = !conf1 && !conf2 && confidence1 + confidence2 >= Qscore; - - StringBuffer result = new StringBuffer(""); - if ( conf1 && conf2 ) - result.append("bothConfident"); - else if ( confCombo ) - result.append("confidentWhenCombined"); - else if ( conf1 ||conf2 ) - result.append("onlyOneConfident"); - else - result.append("neitherConfident"); - - result.append("_"); - - // are they the same genotype - if ( genotype1.equals(genotype2) ) - result.append("sameGenotype"); - else if ( sameVariantAllele(genotype1, genotype2, ref.getBaseAsChar()) ) - result.append("differentGenotypeSameVariantAllele"); - else - result.append("differentVariantAllele"); - - return result.toString(); - } - - // one is variant and the other is ref - else if ( call1IsVariant ) - return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "VariantSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Ref"; - else if ( call2IsVariant ) - return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "RefSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Variant"; - - return null; - } - - private boolean sameVariantAllele(String genotype1, String genotype2, char ref) { - if ( genotype1.length() < 2 || genotype2.length() < 2 ) - return genotype1.equals(genotype2); - char altAllele1 = genotype1.charAt(0) != ref ? genotype1.charAt(0) : genotype1.charAt(1); - char altAllele2 = genotype2.charAt(0) != ref ? 
genotype2.charAt(0) : genotype2.charAt(1); - return altAllele1 == altAllele2; - } - - public String getInfoName() { return "SnpConcordance"; } - public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFHeaderLineType.String, "SNP concordance test"); } -} \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/glf/GLFReader.java b/archive/java/src/org/broadinstitute/sting/glf/GLFReader.java deleted file mode 100644 index 4032faca0..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/GLFReader.java +++ /dev/null @@ -1,240 +0,0 @@ -package org.broadinstitute.sting.utils.genotype.glf; - -import net.sf.samtools.util.BinaryCodec; -import net.sf.samtools.util.BlockCompressedInputStream; -import net.sf.samtools.util.RuntimeEOFException; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotype.LikelihoodObject; - -import java.io.DataInputStream; -import java.io.File; -import java.io.IOException; -import java.util.Iterator; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** an object for reading in GLF files */ -// TODO -- DELETE ME GLF -public class GLFReader implements Iterator { - - // our next record - private GLFRecord nextRecord = null; - - // the glf magic number, which identifies a properly formatted GLF file - public static final short[] glfMagic = {'G', 'L', 'F', '\3'}; - - // our input codec - private final BinaryCodec inputBinaryCodec; - - // our header string - private String headerStr; - - // our reference name - private String referenceName; - - // reference length - private int referenceLength; - - // the current location, keeping track of the offsets - private long currentLocation = 1; - - // we have this variable becuase there is no eof for glf's - private int lastRecordType = -1; - - private File myFile; - - /** - * create a glf reader - * - * @param readFrom the file to read from - */ - public GLFReader(File readFrom) { - myFile = readFrom; - - try { - inputBinaryCodec = new BinaryCodec(new DataInputStream(new BlockCompressedInputStream(readFrom))); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(myFile, e); - } - - inputBinaryCodec.setInputFileName(readFrom.getName()); - - // first verify that it's a valid GLF - for (short s : glfMagic) { - if (inputBinaryCodec.readUByte() != s) - throw new UserException.MalformedFile(myFile, "Verification of GLF format failed: magic string doesn't match)"); - } - - // get the header string - headerStr = inputBinaryCodec.readLengthAndString(false); - - 
if (advanceContig()) { - // setup the next record - next(); - } - - } - - /** - * read in a single point call - * - * @param refBase the reference base - * @param inputBinaryCodec the binary codec - * - * @return a single point call object - */ - private GLFSingleCall generateSPC(char refBase, BinaryCodec inputBinaryCodec) { - int offset = (int) inputBinaryCodec.readUInt(); - long depth = inputBinaryCodec.readUInt(); - short min_lk = (short) ((depth & 0x00000000ff000000) >> 24); - int readDepth = (int) (depth & 0x0000000000ffffff); - short rmsMapping = inputBinaryCodec.readUByte(); - double[] lkValues = new double[LikelihoodObject.GENOTYPE.values().length]; - for (int x = 0; x < LikelihoodObject.GENOTYPE.values().length; x++) { - lkValues[x] = ((double)inputBinaryCodec.readUByte() / GLFRecord.LIKELIHOOD_SCALE_FACTOR + (double)min_lk); - } - return new GLFSingleCall(referenceName, refBase, (int)(offset+currentLocation), readDepth, rmsMapping, lkValues); - } - - /** - * read in a variable length call, and generate a VLC object from the data - * - * @param refBase the reference base - * @param inputBinaryCodec the input codex - * - * @return a GLFVariableLengthCall object - */ - private GLFVariableLengthCall generateVLC(char refBase, BinaryCodec inputBinaryCodec) { - int offset = (int) inputBinaryCodec.readUInt(); - int depth = (int) inputBinaryCodec.readUInt(); - short min_lk = (short) ((depth & 0x00000000ff000000) >> 24); - int readDepth = (depth & 0x0000000000ffffff); - short rmsMapping = inputBinaryCodec.readUByte(); - short lkHom1 = inputBinaryCodec.readUByte(); - short lkHom2 = inputBinaryCodec.readUByte(); - short lkHet = inputBinaryCodec.readUByte(); - int indelLen1 = (int) inputBinaryCodec.readShort(); - int indelLen2 = (int) inputBinaryCodec.readShort(); - - int readCnt = Math.abs(indelLen1); - short indelSeq1[] = new short[readCnt]; - for (int x = 0; x < readCnt; x++) { - indelSeq1[x] = inputBinaryCodec.readUByte(); - } - readCnt = Math.abs(indelLen2); - 
short indelSeq2[] = new short[readCnt]; - for (int x = 0; x < readCnt; x++) { - indelSeq2[x] = inputBinaryCodec.readUByte(); - } - return new GLFVariableLengthCall(referenceName, refBase, offset+currentLocation, readDepth, rmsMapping, lkHom1, lkHom2, lkHet, indelLen1, indelSeq1, indelLen2, indelSeq2); - } - - public boolean hasNext() { - return (nextRecord != null); - } - - public GLFRecord next() { - GLFRecord ret = nextRecord; - short firstBase = protectedByteReadForFile(); - if (firstBase == -1) return ret; - - // parse out the record type and reference base - byte recordType = (byte) ((firstBase & 0x0f0) >> 4); - char refBase = (char) (firstBase & 0x000f); - lastRecordType = recordType; - - if (recordType == 1) { - nextRecord = generateSPC(refBase, inputBinaryCodec); - } else if (recordType == 2) { - nextRecord = generateVLC(refBase, inputBinaryCodec); - } else if (recordType == 0) { - if (advanceContig()) { - return next(); - } - //nextRecord = null; - } else { - throw new UserException.MalformedFile(myFile, "Unknown GLF record type (type = " + recordType + ")"); - } - if (nextRecord != null) currentLocation = nextRecord.getPosition(); - return ret; - } - - /** - * read a short, and if we see an exception only throw it if it's unexpected (not after a zero) - * @return a short - */ - private short protectedByteReadForFile() { - short st = -1; - try { - st = inputBinaryCodec.readUByte(); - } catch (RuntimeEOFException exp) { - nextRecord = null; - if (lastRecordType != 0) { - throw exp; // if the last record was a zero, this is an ok condition. Otherwise throw an exception - } - } - return st; - } - - /** - * advance to the next contig - * - * @return true if we could advance - */ - private boolean advanceContig() { - // try to read the next sequence record - try { - // get the reference name - referenceName = inputBinaryCodec.readLengthAndString(true); - - // get the reference length - this may be a problem storing an unsigned int into a signed int. 
but screw it. - referenceLength = (int) inputBinaryCodec.readUInt(); - //System.err.println(referenceName.length()); - currentLocation = 1; - return true; - } catch (RuntimeException e) { - if (lastRecordType != 0) { - throw e; // if the last record was a zero, this is an ok condition. Otherwise throw an exception - } - nextRecord = null; - } - return false; - } - - public void remove() { - throw new ReviewedStingException("GLFReader doesn't support remove()"); - } - - public void close() { - inputBinaryCodec.close(); - } - - public String getHeaderStr() { - return headerStr; - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/glf/GLFReaderUnitTest.java b/archive/java/src/org/broadinstitute/sting/glf/GLFReaderUnitTest.java deleted file mode 100644 index f5575d956..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/GLFReaderUnitTest.java +++ /dev/null @@ -1,53 +0,0 @@ -package org.broadinstitute.sting.utils.genotype.glf; - -import org.broadinstitute.sting.BaseTest; - -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - - -/** - * @author aaron - *

- * Class GLFReaderUnitTest - *

- * A descriptions should go here. Blame aaron if it's missing. - */ -public class GLFReaderUnitTest extends BaseTest { - - - // our test file - static final File glfFile = new File(validationDataLocation + "index_test_likelihoods.glf"); - //static final File glfFile = new File("CALLS.glf"); - static final int finalRecordCount = 484140; // the number of records in the above file - static final int contigCount = 25; - - /** read in the records from the file */ - @Test - public void testReadRecords() { - int recCount = 0; - List contigs = new ArrayList(); - try { - GLFReader reader = new GLFReader(glfFile); - long location = 1; - while (reader.hasNext()) { - GLFRecord rec = reader.next(); - if (!contigs.contains(rec.getContig())) { - contigs.add(rec.getContig()); - } - location = rec.getPosition(); - //System.err.println("Record count = " + finalRecordCount + " offset " + rec.offset + " location = " + location + " type = " + rec.getRecordType()); - ++recCount; - } - } catch (Exception e) { - System.err.println("Record count = " + recCount); - e.printStackTrace(); - } - Assert.assertEquals(recCount, finalRecordCount); - Assert.assertEquals(contigs.size(), contigCount); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/glf/GLFRecord.java b/archive/java/src/org/broadinstitute/sting/glf/GLFRecord.java deleted file mode 100755 index 77fbbfa09..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/GLFRecord.java +++ /dev/null @@ -1,306 +0,0 @@ -package org.broadinstitute.sting.utils.genotype.glf; - -import net.sf.samtools.util.BinaryCodec; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, 
distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - *

- * Class RecordType - *

- * The base record type for all GLF entries. Each record has a number of fields - * common to the record set. This is also the source of the REF_BASE enumeration, - * which represents the accepted FASTA nucleotide symbols and their assocated GLF - * field values. - */ -// TODO -- DELETE ME GLF -public abstract class GLFRecord { - public final static double LIKELIHOOD_SCALE_FACTOR = 10; - - - // fields common to all records - protected String contig; - protected REF_BASE refBase; - protected long position = 1; - protected int readDepth = 0; - protected short rmsMapQ = 0; - - /** the reference base enumeration, with their short (type) values for GLF */ - public enum REF_BASE { - X((short) 0x00), - A((short) 0x01), - C((short) 0x02), - M((short) 0x03), - G((short) 0x04), - R((short) 0x05), - S((short) 0x06), - V((short) 0x07), - T((short) 0x08), - W((short) 0x09), - Y((short) 0x0A), - H((short) 0x0B), - K((short) 0x0C), - D((short) 0x0D), - B((short) 0x0E), - N((short) 0x0F); - - private final short fieldValue; - - /** - * private constructor, used by the enum class to makes each enum value - * - * @param value the short values specified in the enum listing - */ - REF_BASE(short value) { - fieldValue = value; - } - - /** - * return the character representation - * - * @return the char for the reference base - */ - public char toChar() { - return this.toString().charAt(0); - } - - /** - * static method from returning a REF_BASE given the character representation - * - * @param value the character representation of a REF_BASE - * - * @return the corresponding REF_BASE - * @throws IllegalArgumentException if the value passed can't be converted - */ - public static REF_BASE toBase(char value) { - // for the case where they're passing in the enumated value - if (value <= 0x0F && value >= 0) { - return REF_BASE.values()[value]; - } - String str = String.valueOf(value).toUpperCase(); - for (int x = 0; x < REF_BASE.values().length; x++) { - if 
(REF_BASE.values()[x].toString().equals(str)) { - return REF_BASE.values()[x]; - } - } - throw new IllegalArgumentException("Counldn't find matching reference base for " + str); - } - - /** @return the hex value of the given REF_BASE */ - public short getBaseHexValue() { - return fieldValue; - } - } - - /** the record type enum, which enumerates the different records we can have in a GLF */ - public enum RECORD_TYPE { - SINGLE((short) 1), - VARIABLE((short) 2); - - private final short fieldValue; // a place to store the type - - RECORD_TYPE(short value) { - fieldValue = value; - } - - public short getReadTypeValue() { - return fieldValue; - } - } - - - /** - * Constructor, given the base a character reference base - * - * @param contig the contig string - * @param base the reference base in the reference - * @param position the distance from the beginning of the reference seq - * @param readDepth the read depth at this position - * @param rmsMapQ the root mean square of the mapping quality - */ - public GLFRecord(String contig, char base, long position, int readDepth, short rmsMapQ) { - REF_BASE newBase = REF_BASE.toBase(base); - validateInput(contig, newBase, position, readDepth, rmsMapQ); - } - - /** - * Constructor, given the base a REF_BASE - * - * @param contig the contig string - * @param base the reference base in the reference - * @param position the distance from the beginning of the reference seq - * @param readDepth the read depth at this position - * @param rmsMapQ the root mean square of the mapping quality - */ - GLFRecord(String contig, REF_BASE base, long position, int readDepth, short rmsMapQ) { - validateInput(contig, base, position, readDepth, rmsMapQ); - } - - /** - * validate the input during construction, and store valid values - * - * @param chromosome the reference contig, as a String - * @param base the reference base in the reference, as a REF_BASE - * @param position the distance from the beginning of the reference seq - * @param 
readDepth the read depth at this position - * @param rmsMapQ the root mean square of the mapping quality - */ - private void validateInput(String chromosome, REF_BASE base, long position, int readDepth, short rmsMapQ) { - // add any validation to the contig string here - this.contig = chromosome; - - this.refBase = base; - - if (position > 4294967295L || position < 0) { - throw new IllegalArgumentException("Position is out of bounds (0 to 0xffffffff) value passed = " + position); - } - this.position = position; - -// if (minimumLikelihood > 255 || minimumLikelihood < 0) { -// throw new IllegalArgumentException("minimumLikelihood is out of bounds (0 to 0xffffffff) value passed = " + minimumLikelihood); -// } -// this.minimumLikelihood = GLFRecord.toCappedShort(minimumLikelihood); - - if (readDepth > 16777215 || readDepth < 0) { - throw new IllegalArgumentException("readDepth is out of bounds (0 to 0xffffff) value passed = " + readDepth); - } - this.readDepth = readDepth; - - if (rmsMapQ > 255 || rmsMapQ < 0) { - throw new IllegalArgumentException("rmsMapQ is out of bounds (0 to 0xff) value passed = " + rmsMapQ); - } - this.rmsMapQ = rmsMapQ; - } - - /** - * write the this record to a binary codec output. 
- * - * @param out the binary codec to write to - * @param lastRecord the record to write - */ - void write(BinaryCodec out, GLFRecord lastRecord) { - long offset; - if (lastRecord != null && lastRecord.getContig().equals(this.getContig())) - offset = this.position - lastRecord.getPosition(); - else - offset = this.position - 1; // we start at one, we need to subtract that off - out.writeUByte((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f))); - out.writeUInt(((Long) (offset)).intValue()); // we have to subtract one, we're an offset - long write = ((long) (readDepth & 0xffffff) | (long) (this.getMinimumLikelihood() & 0xff) << 24); - out.writeUInt(write); - out.writeUByte(rmsMapQ); - } - - /** - * get the record type - * - * @return the record type enumeration - */ - public abstract RECORD_TYPE getRecordType(); - - /** - * Return the size of this record in bytes. - * - * @return the size of this record type, in bytes - */ - public int getByteSize() { - return 10; // the record type field (1), offset (4), the min depth field (4), and the rms mapping (1) - } - - /** - * convert a double to a byte, capping it at the maximum value of 255 - * - * @param d a double value - * - * @return a byte, capped at - */ - protected static short toCappedShort(double d) { - return (d > 255.0) ? 
(short) 255 : (short) Math.round(d); - } - - /** - * find the minimum value in a set of doubles - * - * @param vals the array of values - * - * @return the minimum value - */ - protected static double findMin(double vals[]) { - if (vals.length < 1) throw new ReviewedStingException("findMin: an array of size < 1 was passed in"); - - double min = vals[0]; - for (double d : vals) - if (d < min) min = d; - - return min; - } - - public REF_BASE getRefBase() { - return refBase; - } - - public long getPosition() { - return position; - } - - public short getMinimumLikelihood() { - return calculateMinLikelihood(); - } - - public int getReadDepth() { - return readDepth; - } - - public short getRmsMapQ() { - return rmsMapQ; - } - - public String getContig() { - return this.contig; - } - - /** - * this method had to be abstracted so that the underlying records could set the minimum likelihood (ML) in the event - * that the ML is above 255. In this case the records need to scale the value appropriately, and warn the users. - * @return a short of the minimum likelihood. 
- */ - protected abstract short calculateMinLikelihood(); - - public boolean equals(GLFRecord rec) { - return (rec != null) && - contig.equals(rec.getContig()) && - (refBase == rec.getRefBase()) && - (position == rec.getPosition()) && - (readDepth == rec.getReadDepth()) && - (rmsMapQ == rec.getRmsMapQ()); - } - - -} - diff --git a/archive/java/src/org/broadinstitute/sting/glf/GLFRecordUnitTest.java b/archive/java/src/org/broadinstitute/sting/glf/GLFRecordUnitTest.java deleted file mode 100644 index c4e9ba9cc..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/GLFRecordUnitTest.java +++ /dev/null @@ -1,36 +0,0 @@ -package org.broadinstitute.sting.utils.genotype.glf; - -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; - -import org.testng.annotations.Test; - - -/** - * - * @author aaron - * - * Class GLFRecordUnitTest - * - * Test out the basics of a GLFRecord - */ -public class GLFRecordUnitTest extends BaseTest { - - @Test - public void testConstructingGLFRecord() { - double likelihoods[] = new double[10]; - for (int i = 0; i < 10; i++) { - likelihoods[i] = 10.0; - } - GLFRecord rec = new GLFSingleCall("1",'A',1,100,(short)200,likelihoods); - - Assert.assertTrue("1".equals(rec.contig)); - Assert.assertEquals(rec.getRefBase().toChar(), 'A'); - Assert.assertEquals(rec.getPosition(), 1); - Assert.assertEquals(rec.getMinimumLikelihood(), 10); - Assert.assertEquals(rec.getRmsMapQ(), 200); - Assert.assertEquals(rec.getReadDepth(), 100); - - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/glf/GLFSingleCall.java b/archive/java/src/org/broadinstitute/sting/glf/GLFSingleCall.java deleted file mode 100644 index 6bdd9ffc7..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/GLFSingleCall.java +++ /dev/null @@ -1,135 +0,0 @@ -package org.broadinstitute.sting.utils.genotype.glf; - -import net.sf.samtools.util.BinaryCodec; -import org.broadinstitute.sting.utils.Utils; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * 
Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - *

- * Class GLFSingleCall - *

- * This class represents a single point geneotype call in GLF vernacular - */ -// TODO -- DELETE ME GLF -public class GLFSingleCall extends GLFRecord { - - // our likelihoods array - private double likelihoods[]; - private double minLikelihood; - /** - * create a single - * - * @param contig the contig this record is on - * @param refBase the reference base, as a char - * @param position the location, as an offset from the start of the contig - * @param readDepth the read depth at the specified postion - * @param rmsMapQ the root mean square of the mapping quality - * @param likelihoods the Likelihoods - */ - public GLFSingleCall(String contig, char refBase, int position, int readDepth, short rmsMapQ, double likelihoods[]) { - super(contig, refBase, position, readDepth, rmsMapQ); - minLikelihood = GLFRecord.findMin(likelihoods); - this.likelihoods = likelihoods; - } - - - /** - * Write out the record to a binary codec - * - * @param out the codec to write to - */ - void write(BinaryCodec out, GLFRecord lastRec) { - super.write(out, lastRec); - short[] adjusted = new short[likelihoods.length]; - - // we want to scale our values - for (int x = 0; x < likelihoods.length; x++) { - adjusted[x] = GLFRecord.toCappedShort(Math.round(LIKELIHOOD_SCALE_FACTOR * (likelihoods[x] - minLikelihood))); - } - try { - for (short value : adjusted) { - out.writeUByte(value); - } - } catch (Exception e) { - e.printStackTrace(); - } - } - - /** - * return the record type we represent, in this case SINGLE - * - * @return RECORD_TYPE.SINGLE - */ - public RECORD_TYPE getRecordType() { - return RECORD_TYPE.SINGLE; - } - - /** - * return our size in bytes - * - * @return number of bytes we represent - */ - public int getByteSize() { - return likelihoods.length + super.getByteSize(); - } - - /** - * this method had to be abstracted so that the underlying records could set the minimum likelihood (ML) in the event - * that the ML is above 255. 
In this case the records need to scale their likelihood values appropriately, and warn the user. - * - * @return a short of the minimum likelihood. - */ - @Override - protected short calculateMinLikelihood() { - if (minLikelihood > 255.0) { - double scale = minLikelihood - 255.0; - this.minLikelihood = 255.0; - for (int x = 0; x < this.likelihoods.length; x++) - this.likelihoods[x] = this.likelihoods[x] - scale; - Utils.warnUser("GLFRecord: Locus " + this.getContig() + ":" + this.position + " had it's likelihood information scaled, the original likelihood values are unrecoverable"); - } - return toCappedShort(minLikelihood); - } - - @Override - public boolean equals(GLFRecord rec) { - if (!super.equals(rec)) return false; - if (!(rec instanceof GLFSingleCall)) return false; - if (((GLFSingleCall) rec).getLikelihoods().length != this.likelihoods.length) return false; - for (int x = 0; x < likelihoods.length; x++) - if (Double.compare(likelihoods[x],((GLFSingleCall) rec).getLikelihoods()[x]) != 0) return false; - return this.getMinimumLikelihood() == rec.getMinimumLikelihood(); - } - - public double[] getLikelihoods() { - return likelihoods; - } - - -} diff --git a/archive/java/src/org/broadinstitute/sting/glf/GLFVariableLengthCall.java b/archive/java/src/org/broadinstitute/sting/glf/GLFVariableLengthCall.java deleted file mode 100644 index 2f28d5fe5..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/GLFVariableLengthCall.java +++ /dev/null @@ -1,175 +0,0 @@ -package org.broadinstitute.sting.utils.genotype.glf; - -import net.sf.samtools.util.BinaryCodec; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, 
and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - *

- * Class GLFVariableLengthCall - *

- * This class represents variable length genotype calls in the GLF format. - * Currently a lot of parameters need to be provided, but we may be able to thin - * those down as we understand what we have to specify and what we can infer. - */ -// TODO -- DELETE ME GLF -public class GLFVariableLengthCall extends GLFRecord { - // our fields, corresponding to the glf spec - private short lkHom1 = 0; - private short lkHom2 = 0; - private short lkHet = 0; - private int indelLen1 = 0; - private int indelLen2 = 0; - private final short indelSeq1[]; - private final short indelSeq2[]; - private short minlikelihood; - // our size, which is immutable, in bytes - private final int size; - - - /** - * the default constructor - * - * @param contig the contig this record is on - * @param refBase the reference base - * @param offset the location, as an offset from the previous glf record - * @param readDepth the read depth at the specified postion - * @param rmsMapQ the root mean square of the mapping quality - * @param lkHom1 the negitive log likelihood of the first homozygous indel allele, from 0 to 255 - * @param lkHom2 the negitive log likelihood of the second homozygous indel allele, from 0 to 255 - * @param lkHet the negitive log likelihood of the heterozygote, from 0 to 255 - * @param indelSeq1 the sequence for the first indel allele - * @param indelSeq2 the sequence for the second indel allele - */ - public GLFVariableLengthCall(String contig, - char refBase, - long offset, - int readDepth, - short rmsMapQ, - double lkHom1, - double lkHom2, - double lkHet, - int indelOneLength, - final short indelSeq1[], - int indelTwoLength, - final short indelSeq2[]) { - super(contig, refBase, offset, readDepth, rmsMapQ); - this.lkHom1 = GLFRecord.toCappedShort(lkHom1); - this.lkHom2 = GLFRecord.toCappedShort(lkHom2); - this.lkHet = GLFRecord.toCappedShort(lkHet); - this.indelLen1 = indelOneLength; - this.indelLen2 = indelTwoLength; - this.indelSeq1 = indelSeq1; - this.indelSeq2 = 
indelSeq2; - size = 16 + indelSeq1.length + indelSeq2.length; - this.minlikelihood = GLFRecord.toCappedShort(findMin(new double[]{lkHom1, lkHom2, lkHet})); - } - - /** - * Write out the record to a binary codec - * - * @param out the binary codec to write to - */ - void write(BinaryCodec out, GLFRecord rec) { - super.write(out,rec); - out.writeByte(lkHom1); - out.writeByte(lkHom2); - out.writeByte(lkHet); - out.writeShort(new Integer(indelLen1).shortValue()); - out.writeShort(new Integer(indelLen2).shortValue()); - for (short anIndelSeq1 : indelSeq1) { - out.writeUByte(anIndelSeq1); - } - for (short anIndelSeq2 : indelSeq2) { - out.writeUByte(anIndelSeq2); - } - } - - /** @return RECORD_TYPE.VARIABLE */ - public RECORD_TYPE getRecordType() { - return RECORD_TYPE.VARIABLE; - } - - /** @return the size of the record, which is the size of our fields plus the generic records fields */ - public int getByteSize() { - return size + super.getByteSize(); - } - - /** - * this method had to be abstracted so that the underlying records could set the minimum likelihood (ML) in the event - * that the ML is above 255. In this case the records need to scale the value appropriately, and warn the users. - * - * @return a short of the minimum likelihood. 
- */ - @Override - protected short calculateMinLikelihood() { - return minlikelihood; - } - - public short getLkHom1() { - return lkHom1; - } - - public short getLkHom2() { - return lkHom2; - } - - public short getLkHet() { - return lkHet; - } - - public short[] getIndelSeq1() { - return indelSeq1; - } - - public short[] getIndelSeq2() { - return indelSeq2; - } - - public int getIndelLen2() { - return indelLen2; - } - - public int getIndelLen1() { - return indelLen1; - } - - public boolean equals(GLFRecord rec) { - if (!super.equals(rec)) return false; - if (!(rec instanceof GLFVariableLengthCall)) return false; - if (lkHom1 != ((GLFVariableLengthCall) rec).getLkHom1()) return false; - if (lkHom2 != ((GLFVariableLengthCall) rec).getLkHom2()) return false; - if (lkHet != ((GLFVariableLengthCall) rec).getLkHet()) return false; - if (indelLen1 != ((GLFVariableLengthCall) rec).getIndelLen1()) return false; - if (indelLen2 != ((GLFVariableLengthCall) rec).getIndelLen2()) return false; - for (int x = 0; x < indelSeq1.length; x++) - if (indelSeq1[x] != ((GLFVariableLengthCall) rec).getIndelSeq1()[x]) return false; - for (int x = 0; x < indelSeq2.length; x++) - if (indelSeq2[x] != ((GLFVariableLengthCall) rec).getIndelSeq2()[x]) return false; - return minlikelihood == rec.getMinimumLikelihood() && size == rec.getByteSize(); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/glf/GLFWriter.java b/archive/java/src/org/broadinstitute/sting/glf/GLFWriter.java deleted file mode 100755 index 48569db9a..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/GLFWriter.java +++ /dev/null @@ -1,259 +0,0 @@ -package org.broadinstitute.sting.utils.genotype.glf; - -import net.sf.samtools.SAMSequenceRecord; -import net.sf.samtools.util.BinaryCodec; -import net.sf.samtools.util.BlockCompressedOutputStream; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.genotype.IndelLikelihood; -import 
org.broadinstitute.sting.utils.genotype.LikelihoodObject; - -import java.io.DataOutputStream; -import java.io.File; -import java.io.OutputStream; -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - * @version 1.0 - *

- * This class writes GLF files. You can either specify GLFRecords, or programaticly generate - * single and variable length genotype calls using the provided functions. When you've finished - * generating GLF records, make sure you close the file. - */ -// TODO -- DELETE ME GLF -public class GLFWriter { - // our output codec - private final BinaryCodec outputBinaryCodec; - - // the glf magic number, which identifies a properly formatted GLF file - public static final short[] glfMagic = {'G', 'L', 'F', '\3'}; - - // our header text, reference sequence name (i.e. chr1), and it's length - private String headerText = null; - private String referenceSequenceName = null; - private long referenceSequenceLength = 0; - - // we need to store the last record so we can calculate the offsets - private GLFRecord mLastRecord = null; - - // the last position written - private int lastPos = 1; - - // a field for storing the RMS of the mapping qualities in a mutable variant context - public static final String RMS_MAPPING_QUAL = "RMS_MAPPING_QUAL"; - - /** - * The public constructor for creating a GLF object - * - * @param writeTo the location to write to - */ - public GLFWriter(File writeTo) { - outputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(writeTo))); - outputBinaryCodec.setOutputFileName(writeTo.toString()); - } - - /** - * The public constructor for creating a GLF object - * - * @param writeTo the location to write to - */ - public GLFWriter(OutputStream writeTo) { - outputBinaryCodec = new BinaryCodec(writeTo); - outputBinaryCodec.setOutputFileName(writeTo.toString()); - } - - /** - * Write out the header information for the GLF file. The header contains - * the magic number, the length of the header text, the text itself, the reference - * sequence (null terminated) preceeded by it's length, and the the genomic - * length of the reference sequence. 
- * - * @param headerText the header text to write - */ - public void writeHeader(String headerText) { - this.headerText = headerText; - for (short aGlfMagic : glfMagic) { - outputBinaryCodec.writeUByte(aGlfMagic); - } - if (!(headerText.equals(""))) { - outputBinaryCodec.writeString(headerText, true, true); - } else { - outputBinaryCodec.writeInt(0); - } - } - - /** - * add a point genotype to the GLF writer - * - * @param contig the name of the contig you're calling in - * @param refBase the reference base, as a char - * @param genomicLoc the location the location on the reference contig - * @param readDepth the read depth at the specified postion - * @param rmsMapQ the root mean square of the mapping quality - * @param lhValues the GenotypeLikelihoods object, representing the genotype likelyhoods - */ - public void addCall(SAMSequenceRecord contig, - int genomicLoc, - float rmsMapQ, - char refBase, - int readDepth, - LikelihoodObject lhValues) { - if ( headerText == null ) - throw new IllegalStateException("The GLF Header must be written before calls can be added"); - - // check if we've jumped to a new contig - checkSequence(contig.getSequenceName(), contig.getSequenceLength()); - - GLFSingleCall callGLF = new GLFSingleCall(contig.getSequenceName(), - refBase, - genomicLoc, - readDepth, - (short) rmsMapQ, - lhValues.toDoubleArray()); - lastPos = genomicLoc; - callGLF.write(this.outputBinaryCodec,mLastRecord); - mLastRecord = callGLF; - } - - /** - * Add a genotype, given a variant context - * - * @param vc the variant context representing the call to add - * @param refBase not used by this writer - */ - public void add(VariantContext vc, byte refBase) { - throw new UnsupportedOperationException("We no longer support writing GLF"); - } - - /** - * add a variable length (indel, deletion, etc) to the genotype writer - * - * @param contig the name of the contig you're calling in - * @param refBase the reference base - * @param genomicLoc the location on the 
reference contig - * @param readDepth the read depth at the specified postion - * @param rmsMapQ the root mean square of the mapping quality - * @param firstHomZyg the first homozygous call - * @param secondHomZyg the second homozygous call - * @param hetLikelihood the negitive log likelihood of the heterozygote, from 0 to 255 - */ - public void addVariableLengthCall(SAMSequenceRecord contig, - int genomicLoc, - float rmsMapQ, - int readDepth, - char refBase, - IndelLikelihood firstHomZyg, - IndelLikelihood secondHomZyg, - byte hetLikelihood) { - - if ( headerText == null ) - throw new IllegalStateException("The GLF Header must be written before calls can be added"); - - // check if we've jumped to a new contig - checkSequence(contig.getSequenceName(), contig.getSequenceLength()); - - // normalize the two - GLFVariableLengthCall call = new GLFVariableLengthCall(contig.getSequenceName(), - refBase, - genomicLoc - lastPos, - readDepth, - (short) rmsMapQ, - firstHomZyg.getLikelihood(), - secondHomZyg.getLikelihood(), - hetLikelihood, - firstHomZyg.getLengthOfIndel(), - firstHomZyg.getIndelSequence(), - secondHomZyg.getLengthOfIndel(), - secondHomZyg.getIndelSequence()); - lastPos = genomicLoc; - call.write(this.outputBinaryCodec,mLastRecord); - mLastRecord = call; - } - - /** - * add a GLF record to the output file - * - * @param contigName the contig name - * @param contigLength the contig length - * @param rec the GLF record to write. 
- */ - public void addGLFRecord(String contigName, int contigLength, GLFRecord rec) { - if ( headerText == null ) - throw new IllegalStateException("The GLF Header must be written before records can be added"); - - checkSequence(contigName, contigLength); - rec.write(this.outputBinaryCodec,mLastRecord); - mLastRecord = rec; - } - - /** - * check to see if we've jumped to a new contig - * - * @param sequenceName the name for the sequence - * @param seqLength the sequence length - */ - private void checkSequence(String sequenceName, int seqLength) { - if ((referenceSequenceName == null) || (!referenceSequenceName.equals(sequenceName))) { - if (this.referenceSequenceName != null) { // don't write the record the first time - this.writeEndRecord(); - } - referenceSequenceName = sequenceName; - referenceSequenceLength = seqLength; - lastPos = 1; - addSequence(); - } - } - - - /** add a sequence definition to the glf */ - private void addSequence() { - if ( headerText == null ) - throw new IllegalStateException("The GLF Header must be written before sequences can be added"); - - outputBinaryCodec.writeString(referenceSequenceName, true, true); - outputBinaryCodec.writeUInt(referenceSequenceLength); - } - - /** write end record */ - private void writeEndRecord() { - if ( headerText == null ) - throw new IllegalStateException("The GLF Header must be written before records can be added"); - - outputBinaryCodec.writeUByte((short) 0); - } - - - /** - * close the file. You must close the file to ensure any remaining data gets - * written out. 
- */ - public void close() { - writeEndRecord(); - outputBinaryCodec.close(); - } -} - - diff --git a/archive/java/src/org/broadinstitute/sting/glf/GLFWriterUnitTest.java b/archive/java/src/org/broadinstitute/sting/glf/GLFWriterUnitTest.java deleted file mode 100755 index d0b5158e9..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/GLFWriterUnitTest.java +++ /dev/null @@ -1,174 +0,0 @@ -package org.broadinstitute.sting.utils.genotype.glf; - -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.genotype.LikelihoodObject; - -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; - -import net.sf.samtools.SAMSequenceRecord; -import net.sf.picard.reference.IndexedFastaSequenceFile; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - *

- * Class GLFRecordTest - *

- * Tests for the GLFRecord class - */ -public class GLFWriterUnitTest extends BaseTest { - - /** some made up values that we use to generate the GLF */ - private final String header = ""; - private static final int GENOTYPE_COUNT = 10; - private GLFWriter rec; - protected static final String[] genotypes = {"AA", "AC", "AG", "AT", "CC", "CG", "CT", "GG", "GT", "TT"}; - protected final static double SIGNIFICANCE = 5.1; - - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - @BeforeMethod - public void before() { - - } - - @BeforeClass - public void beforeTests() { - File referenceFile = new File(b36KGReference); - try { - seq = new CachingIndexedFastaSequenceFile(referenceFile); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(referenceFile,ex); - } - genomeLocParser = new GenomeLocParser(seq); - - } - - /** - * create a fake genotype likehoods set - * @param bestGenotype the best genotype, as an index into the array of values - * @return fake genotype likelihoods - */ - private LikelihoodObject createLikelihoods(int bestGenotype) { - double lk[] = new double[GENOTYPE_COUNT]; - for (int x = 0; x < GENOTYPE_COUNT; x++) { - lk[x] = -15.0 - (double) x; // they'll all be unique like a snowflake - } - lk[bestGenotype] = -10.0; // lets make the best way better - return new LikelihoodObject(lk, LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); - } - - /** - * create a fake genotype likelihhods set with a minimum likelihood greater than 255 - * @param bestGenotype the best genotype, as an index into the array of values - * @return fake genotype likelihoods - */ - private LikelihoodObject createGreaterThan255MinimumGenotype(int bestGenotype) { - double lk[] = new double[GENOTYPE_COUNT]; - for (int x = 0; x < GENOTYPE_COUNT; x++) { - lk[x] = -355.0 - (double) x; // they'll all be unique like a snowflake - } - lk[bestGenotype] = -256.0; // lets make the best way better - return new LikelihoodObject(lk, 
LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); - } - - - /** - * can we actually write a file? - */ - @Test - public void basicWrite() { - File writeTo = new File("testGLF.glf"); - writeTo.deleteOnExit(); - - rec = new GLFWriter(writeTo); - rec.writeHeader(header); - for (int x = 0; x < 100; x++) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(seq.getSequenceDictionary().getSequence(1).getSequenceName(), x + 1); - rec.addCall(new SAMSequenceRecord("test", 0), (int)loc.getStart(), 10, 'A', 9, createLikelihoods(x % 10)); - } - rec.close(); - - } - - /** - * can we actually write a file? - */ - @Test - public void basicWriteGreaterMinimumLikelihood() { - File writeTo = new File("testGLF2.glf"); - writeTo.deleteOnExit(); - - rec = new GLFWriter(writeTo); - rec.writeHeader(header); - for (int x = 0; x < 5; x++) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(seq.getSequenceDictionary().getSequence(1).getSequenceName(), x + 1); - rec.addCall(new SAMSequenceRecord("test", 0), (int)loc.getStart(), 10, 'A', 9, createGreaterThan255MinimumGenotype(x % 10)); - } - rec.close(); - - } - - /** - * write a bunch of fake records a GLF file, and then read it back from the - * same file. We want to make sure a round trip is successful; that we write - * and then read the same information back. 
- */ - @Test - public void basicWriteThenRead() { - File writeTo = new File("testGLF2.glf"); - writeTo.deleteOnExit(); - rec = new GLFWriter(writeTo); - rec.writeHeader(header); - for (int x = 0; x < 100; x++) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(seq.getSequenceDictionary().getSequence(1).getSequenceName(), x + 1); - rec.addCall(new SAMSequenceRecord("test", 0), (int)loc.getStart(), 10, 'A', 9, createLikelihoods(x % 10)); - } - rec.close(); - GLFReader reader = new GLFReader(writeTo); - int count = 0; - while (reader.hasNext()) { - reader.next(); - count++; - } - Assert.assertEquals(100, count); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/glf/IndelLikelihood.java b/archive/java/src/org/broadinstitute/sting/glf/IndelLikelihood.java deleted file mode 100644 index ae969c254..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/IndelLikelihood.java +++ /dev/null @@ -1,74 +0,0 @@ -package org.broadinstitute.sting.utils.genotype; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - *

- * Class IndelLikelihood - *

- * The representation of an indel allele. - */ -// TODO -- DELETE ME GLF -public class IndelLikelihood { - - protected double loglikelihood; - protected short lengthOfIndel; - protected short[] indelSequence; - - /** - * Create a likelihood object for an indel call - * - * @param likelihood the likelihood (represented as a negitive log likelihood, - * with a ceiling of 255. - * @param indelSequence the indel sequence, not null terminated - */ - public IndelLikelihood( byte likelihood, String indelSequence ) { - this.loglikelihood = likelihood; - this.lengthOfIndel = (short)indelSequence.length(); - this.indelSequence = new short[indelSequence.length()]; - for (int tmp = 0; tmp < indelSequence.length(); tmp++) { - this.indelSequence[tmp] = (short)indelSequence.charAt(tmp); - } - } - - /** - * getter methods - */ - - public double getLikelihood() { - return loglikelihood; - } - - public short getLengthOfIndel() { - return lengthOfIndel; - } - - public short[] getIndelSequence() { - return indelSequence; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/glf/LikelihoodObject.java b/archive/java/src/org/broadinstitute/sting/glf/LikelihoodObject.java deleted file mode 100755 index 7227278e5..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/LikelihoodObject.java +++ /dev/null @@ -1,290 +0,0 @@ -package org.broadinstitute.sting.utils.genotype; - -import edu.mit.broad.picard.genotype.DiploidGenotype; -import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.HashMap; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, 
distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - *

- * Class LikelyhoodObject - *

- * An object used to store likelyhood information for genotypes. Genotype - * likelihoods are assumed to be infinite (negitive log likelihood), unless set. - * This allows the consumer to make an empty LikelihoodObject, and just set - * those values which have associated likelihood values. - */ - -// TODO -- DELETE ME GLF -public class LikelihoodObject { - - - // our possible genotypes, in order according to GLFv3 - public enum GENOTYPE { - AA, AC, AG, AT, CC, CG, CT, GG, GT, TT - } - - // our pileup of bases - //final private String basePileup; - - // possible types of likihoods to store - - public enum LIKELIHOOD_TYPE { - NEGATIVE_LOG, LOG, RAW; - } - - // our liklihood storage type - protected LIKELIHOOD_TYPE mLikelihoodType = LIKELIHOOD_TYPE.NEGATIVE_LOG; - - // default the bestGenotype likelihoods to the allele AA - protected GENOTYPE bestGenotype = GENOTYPE.AA; - - // how many genotypes we're storing - public static final int genoTypeCount = GENOTYPE.values().length; - - // the associated negitive log likelihoods for each genotype - protected final HashMap likelihoods = new HashMap(); - - /** create a blank likelihood object */ - public LikelihoodObject() { - for (GENOTYPE type : GENOTYPE.values()) { - likelihoods.put(type, Double.MAX_VALUE); - } - } - - /** - * create a likelihood object, given a picard style GenotypeLikelihoods object. 
The - * GenotypeLikelihoods stores likelihoods in log likelihood format, and we want them in - * negitive log likelihood - * - * @param lk the likelihood object - */ - public LikelihoodObject(GenotypeLikelihoods lk) { - mLikelihoodType = LIKELIHOOD_TYPE.LOG; - Double minValue = Double.MAX_VALUE; - for (GENOTYPE type : GENOTYPE.values()) { - byte[] bases = new byte[2]; - bases[0] = (byte) type.toString().charAt(0); - bases[1] = (byte) type.toString().charAt(1); - double val = -1.0d * lk.getLikelihood(DiploidGenotype.fromBases(bases)); - likelihoods.put(type, val); - if (val < minValue) { - bestGenotype = type; - } - } - } - - /** - * create a likelyhood object, given an array of genotype scores in GLFv3 ordering - * - * @param values an array of int's from 0 to 255, representing the negitive log likelihoods. - * @param type the likelihood storage type - */ - public LikelihoodObject(double[] values, LIKELIHOOD_TYPE type) { - mLikelihoodType = type; - if (values.length != GENOTYPE.values().length) { - throw new IllegalArgumentException("invalid array passed to LikelihoodObject, should be size " + GENOTYPE.values().length); - } - findBestLikelihood(values); - } - - /** - * find the best likelihood - * @param values - */ - private void findBestLikelihood(double[] values) { - int index = 0; - double lowestScore = Double.MAX_VALUE; - for (GENOTYPE t : GENOTYPE.values()) { - likelihoods.put(t, values[index]); - if (values[index] < lowestScore) { - lowestScore = values[index]; - bestGenotype = t; - } - ++index; - } - } - - /** - * set the likelihood, given it's probability and the genotype - * - * @param type the genotype - * @param lh the likelihood as a double - */ - public void setLikelihood(GENOTYPE type, double lh) { - likelihoods.put(type, lh); - if (lh < likelihoods.get(this.bestGenotype)) { - this.bestGenotype = type; - } - } - - /** - * find the minimum likelihood value stored in the set. 
This represents the most likely genotype, - * since genotypes are represented as negitive log likeihoods - * - * @return the min value - */ - public double getBestLikelihood() { - return likelihoods.get(this.bestGenotype); - } - - /** - * return a byte array representation of the likelihood object, in GLFv3 specified order. - * The return type is short[] instead of byte[], since signed bytes only store -127 to 127, - * not the 255 range we need. - * - * @return a byte array of the genotype values - */ - public short[] toByteArray() { - short ret[] = new short[GENOTYPE.values().length]; - int index = 0; - for (GENOTYPE type : GENOTYPE.values()) { - ret[index] = (likelihoods.get(type).intValue() > 254) ? 255 : (short) likelihoods.get(type).intValue(); - ++index; - } - return ret; - } - - /** - * create a float array of our genotype values, in order specified in the GENOTYPE enum (currently the GLF and - * geli ordering). - * - * @return a float array containing our genotype likelihoods, as negitive log likelihoods - */ - public double[] toDoubleArray() { - // make an array of floats - double[] ft = new double[10]; - int index = 0; - for (GENOTYPE T : GENOTYPE.values()) { - ft[index] = this.likelihoods.get(T).doubleValue(); - index++; - } - return ft; - } - - /** - * convert this object, with aditional information, to a GenotypeLikelihoods object. This involves determining - * what our underlying storage type is, and coverting our values to the appropriate (log likelihood) format. 
- * - * @return a GenotypeLikelihoods object representing our data - */ - public GenotypeLikelihoods convertToGenotypeLikelihoods(SAMFileHeader samHeader, int seqIndex, int seqPosition, byte refBase) { - double[] ft = toDoubleArray(); - float[] db = new float[ft.length]; - int index = 0; - if (this.mLikelihoodType == LIKELIHOOD_TYPE.NEGATIVE_LOG) { - for (; index < ft.length; index++) { - db[index] = ((float) ft[index] * -1.0f); - } - } else if (this.mLikelihoodType == LIKELIHOOD_TYPE.RAW) { - for (; index < ft.length; index++) { - db[index] = (float) Math.log(ft[index]); - } - } else { - for (int x = 0; x < ft.length; x++) - db[x] = (float)ft[x]; - } - return new GenotypeLikelihoods(samHeader, seqIndex, seqPosition, refBase, db); - } - - /** - * getter for the likelihood type - * - * @return our likelihood storage type - */ - public LIKELIHOOD_TYPE getLikelihoodType() { - return mLikelihoodType; - } - - - /** - * validate a genotype score - * - * @param score the score to validate - */ - public void validateScore(double score) { - int x = 0; - switch (mLikelihoodType) { - case NEGATIVE_LOG: - if (score < 0) - throw new ReviewedStingException("Likelikhood score of " + score + " is invalid, for NEGATIVE_LOG it must be greater than or equal to 0"); - break; - case LOG: - if (score > 0) - throw new ReviewedStingException("Likelikhood score of " + score + " is invalid, for LOG it must be less than or equal to 0"); - break; - case RAW: - if (score < 0 || score > 1) - throw new ReviewedStingException("Likelikhood score of " + score + " is invalid, for RAW it must be [0,1]"); - break; - } - } - - - /** - * set our likelihood storage type, and adjust our current likelihood values to reflect - * the new setting. - * - * @param likelihood the type to set the values to. 
- */ - public void setLikelihoodType(LIKELIHOOD_TYPE likelihood) { - if (likelihood == mLikelihoodType) - return; - if (mLikelihoodType == LIKELIHOOD_TYPE.RAW) { - double mult = 1.0; - if (likelihood == LIKELIHOOD_TYPE.NEGATIVE_LOG) { - mult = -1.0; - } - // one of us in log, the other negitive log, it doesn't matter which - for (GENOTYPE g : likelihoods.keySet()) { - likelihoods.put(g, -1.0 * Math.log(likelihoods.get(g))); - } - } else if (likelihood == LIKELIHOOD_TYPE.RAW) { - double mult = 1.0; - if (mLikelihoodType == LIKELIHOOD_TYPE.NEGATIVE_LOG) { - mult = -1.0; - } - // one of us in log, the other negitive log, it doesn't matter which - for (GENOTYPE g : likelihoods.keySet()) { - likelihoods.put(g, Math.pow(likelihoods.get(g) * mult, 10)); - } - } else { - // one of us in log, the other negitive log, it doesn't matter which - for (GENOTYPE g : likelihoods.keySet()) { - likelihoods.put(g, -1.0 * likelihoods.get(g)); - } - } - this.mLikelihoodType = likelihood; - } -} - - diff --git a/archive/java/src/org/broadinstitute/sting/glf/LikelihoodObjectUnitTest.java b/archive/java/src/org/broadinstitute/sting/glf/LikelihoodObjectUnitTest.java deleted file mode 100755 index 260a0f2a6..000000000 --- a/archive/java/src/org/broadinstitute/sting/glf/LikelihoodObjectUnitTest.java +++ /dev/null @@ -1,135 +0,0 @@ -package org.broadinstitute.sting.utils.genotype; - -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.Test; -import org.broadinstitute.sting.BaseTest; - -import static org.testng.Assert.assertTrue; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * 
Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - *

- * Class LikelihoodObjectUnitTest - *

- * Tests the Likelihood object. - */ -public class LikelihoodObjectUnitTest extends BaseTest { - - private LikelihoodObject mLO = null; - - @BeforeMethod - public void before() { - mLO = new LikelihoodObject(); - } - - @Test - public void testBlankConstruction() { - mLO = new LikelihoodObject(); - assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); - } - - @Test - public void testConstructionFromArray() { - double[] ray = new double[10]; - for (int x = 0; x < 10; x++) { - ray[x] = ( x * 25 ); - } - mLO = new LikelihoodObject(ray,LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); - assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); - - int index = 0; - for (LikelihoodObject.GENOTYPE t : LikelihoodObject.GENOTYPE.values()) { - assertTrue(ray[index] == mLO.likelihoods.get(t)); - ++index; - } - } - - @Test - public void testByteArrayReturn() { - double[] ray = new double[10]; - for (int x = 0; x < 10; x++) { - ray[x] = ( x * 25.0 ); - } - mLO = new LikelihoodObject(ray,LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); - assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); - - int index = 0; - short[] ret = mLO.toByteArray(); - for (index = 0; index < ret.length; index++) { - assertTrue(ray[index] == ret[index]); - } - } - - @Test - public void testDefaultArrayValues() { - mLO = new LikelihoodObject(); - short[] ret = mLO.toByteArray(); - for (int index = 0; index < ret.length; index++) { - assertTrue(ret[index] == 255); - } - } - - @Test - public void testGetMinimum() { - double[] ray = new double[10]; - for (int x = 0; x < 10; x++) { - ray[x] = ( 240.0 ); - ray[x] = ( 240.0 ); - } - ray [5] = 0; - mLO = new LikelihoodObject(ray, LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); - assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); - short smallest = (short)mLO.getBestLikelihood(); - assertTrue(smallest == 0); - int index = 0; - short[] ret = mLO.toByteArray(); 
- for (index = 0; index < ret.length; index++) { - assertTrue(smallest <= ret[index]); - } - } - - - @Test - public void testSetLikelihood() { - mLO = new LikelihoodObject(); - for (LikelihoodObject.GENOTYPE t : LikelihoodObject.GENOTYPE.values()) { - mLO.setLikelihood(t,128); - } - assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); - - int index = 0; - short[] ret = mLO.toByteArray(); - for (index = 0; index < ret.length; index++) { - assertTrue(ret[index] == 128); - } - } - - -} diff --git a/archive/java/src/org/broadinstitute/sting/graphalign/GraphReferenceAssessor.java b/archive/java/src/org/broadinstitute/sting/graphalign/GraphReferenceAssessor.java deleted file mode 100755 index 1f7a85df3..000000000 --- a/archive/java/src/org/broadinstitute/sting/graphalign/GraphReferenceAssessor.java +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.graphalign; - -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.*; -import java.io.*; - -import net.sf.samtools.SAMRecord; - -/** - * A completely experimental read walker that consumes a graphical reference emitted by GraphReferenceBuilder as a - * serialized java object and evaluates the number of mismatches to both the flat reference and the graphical - * reference for each read [Not for public use and will change drastically in the future]. 
- */ -public class GraphReferenceAssessor extends ReadWalker { - @Output - PrintStream out; - - @Argument(fullName="graphFile", shortName="GF", doc="", required=true) - String graphFile = null; - ObjectInputStream graphSerialStream = null; - - @Argument(fullName="MAX", shortName="MAX", doc="", required=false) - int MAXREADS = -1; - - @Argument(fullName="ignore0MM", shortName="I0", doc="", required=false) - boolean IGNORE_0_MM = false; - - @Argument(fullName="DEBUG", shortName="DB", doc="", required=false) - int DEBUG_LEVEL = 0; - - static boolean DEBUG = false; - static boolean DEBUG2 = false; // crazy level - - @Argument(fullName="read", doc="", required=false) - String onlyDoRead = null; - - ReferenceGraph graphRef = null; - - public void initialize() { - super.initialize(); - - DEBUG = DEBUG_LEVEL > 0; - DEBUG2 = DEBUG_LEVEL > 1; // crazy level - - try { - logger.info("Reading graph reference " + graphFile ); - graphSerialStream = new ObjectInputStream( new FileInputStream( graphFile ) ); - graphRef = (ReferenceGraph)graphSerialStream.readObject(); - graphRef.setDebugPrinting(DEBUG); - graphRef.validateGraph(); - logger.info(graphRef.toBriefString()); - } catch ( FileNotFoundException e ) { - throw new StingException("Couldn't open file " + graphFile, e); - } catch ( IOException e ) { - throw new StingException("Couldn't write to file " + graphFile, e); - } catch ( ClassNotFoundException e ) { - throw new StingException("Couldn't read ReferenceGraph from file " + graphFile, e); - } - } - - private static MismatchCounter countMismatches(byte[] ref, int refOffset, byte[] bases, byte[] quals, int basesOffset, int length) { - MismatchCounter mm = new MismatchCounter(); - - for ( int i = 0; i < length; i++ ) { - byte rawRefBase = ref[i + refOffset]; - byte rawReadBase = bases[i + basesOffset]; - int fragBase = BaseUtils.simpleBaseToBaseIndex((char)rawRefBase); - int readBase = BaseUtils.simpleBaseToBaseIndex((char)rawReadBase); - - boolean mmP = fragBase != -1 && 
readBase != -1 && fragBase != readBase; - if ( mmP ) { - mm.nMM++; - mm.qSum += quals != null ? quals[i + basesOffset] : 0; - } - - if ( GraphReferenceAssessor.DEBUG2 ) - System.out.printf("%s%d %c %c %s %b%n", Utils.dupString(' ', basesOffset + 2), basesOffset, (char)rawRefBase, (char)rawReadBase, mm, mmP); - } - - return mm; - } - - private static MismatchCounter countMismatches(byte[] ref, byte[] bases, byte[] quals) { - return countMismatches(ref, 0, bases, quals, 0, bases.length); - } - - private static MismatchCounter countMismatchesOnGraph( ReferenceGraph graph, Collection frags, int fragOffset, byte[] bases, byte[] quals, int readOffset ) { - if ( frags.size() == 0 ) - throw new RuntimeException("Fragment list is empty!"); - - MismatchCounter minNMM = MismatchCounter.MAX_VALUE; - - for ( Fragment next : frags ) { - MismatchCounter recNMM = countMismatchesOnGraph( graph, next, 0, bases, quals, readOffset ); - minNMM = minNMM.min( recNMM ); - } - - return minNMM; - } - - private static MismatchCounter countMismatchesOnGraph( ReferenceGraph graph, Fragment frag, int fragOffset, byte[] bases, byte[] quals, int readOffset ) { - if ( GraphReferenceAssessor.DEBUG )System.out.printf("%sfrag %s -> %d%n", Utils.dupString(' ', readOffset + 2), frag, readOffset); - - MismatchCounter mm = new MismatchCounter(); - - if ( readOffset < bases.length ) { - int nRemainingBases = bases.length - readOffset; - int cmpLength = frag.getBaseLengthFrom(fragOffset, nRemainingBases); // how many bases over in the fragment are we from the offset - MismatchCounter fragMM = countMismatches(frag.getUnderlyingBases(), frag.getUnderlyingOffset() + fragOffset, bases, quals, readOffset, cmpLength); - mm.add(fragMM); - -// // still have some counting to do -// for ( int i = 0; i < baseLength; i++ ) { -// int fragBaseOffset = fragOffset + i; -// int readBaseOffset = readOffset + i; -// -// byte rawFragBase = frag.getBase(fragBaseOffset); -// byte rawReadBase = bases[readBaseOffset]; -// int 
fragBase = BaseUtils.simpleBaseToBaseIndex((char)rawFragBase); -// int readBase = BaseUtils.simpleBaseToBaseIndex((char)rawReadBase); -// -// boolean mmP = fragBase != -1 && readBase != -1 && fragBase != readBase; -// if ( mmP ) nMM++; - - if ( nRemainingBases > cmpLength ) { - MismatchCounter recMM = countMismatchesOnGraph( graph, graph.outgoingFragments(frag), 0, bases, quals, readOffset + cmpLength ); - mm.add(recMM); - } - } - - if ( GraphReferenceAssessor.DEBUG ) System.out.printf("%s=> %s%n", Utils.dupString(' ', readOffset + 2), mm); - return mm; - } - - private static MismatchCounter countMismatchesOnGraph(ReferenceGraph graph, SAMRecord read) { - if ( GraphReferenceAssessor.DEBUG ) System.out.printf("countMismatchesOnGraph( read=%s%n", read.getReadName()); - GenomeLoc loc = GenomeLocParser.createGenomeLoc(read); - MismatchCounter minNMM = MismatchCounter.MAX_VALUE; - - for ( Fragment frag : graph.getStartingFragment(loc) ) { - int fragOffset = frag.getFragOffsetFrom(loc); // how many bases over in the fragment are we from the offset - - if ( GraphReferenceAssessor.DEBUG ) - System.out.printf(" countMismatchesOnGraph frag=%s loc=%s bases=%s offset=%d%n", frag, loc, read.getReadString(), fragOffset); - - MismatchCounter recNMM = countMismatchesOnGraph(graph, frag, fragOffset, read.getReadBases(), read.getBaseQualities(), 0); - minNMM = minNMM.min( recNMM ); - } - - return minNMM; - } - - public Integer map(ReferenceContext refArg, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - - if ( MAXREADS-- == 0 ) { - System.exit(0); - } else if ( onlyDoRead != null && ! read.getReadName().equals(onlyDoRead) ) { - ; - } else if ( ! read.getReadUnmappedFlag() && read.getCigar().numCigarElements() == 1 ) { - try { - byte[] ref = refArg.getBases(); - // we're all XM - int nMMFromRead = (Short)read.getAttribute("NM"); - MismatchCounter nAlignedMM = countMismatches(ref, read.getReadBases(), read.getBaseQualities()); - if ( ! 
IGNORE_0_MM || nAlignedMM.nMM > 0 ) { - MismatchCounter nGraphMM = countMismatchesOnGraph(graphRef, read); - MismatchCounter deltaMM = nAlignedMM.minus(nGraphMM); - - out.printf("%50s with %5s at %10s: mismatches: %3d (delta %3d) -- %3d %3d -- %3d %3d -- delta %3d %3d%n", - read.getReadName(), read.getCigarString(), GenomeLocParser.createGenomeLoc(read), - nMMFromRead, nMMFromRead - nAlignedMM.nMM, - nAlignedMM.nMM, nAlignedMM.qSum, - nGraphMM.nMM, nGraphMM.qSum, - deltaMM.nMM, deltaMM.qSum); - - if ( deltaMM.nMM < 0 || deltaMM.qSum < 0 ) - throw new StingException(read.getReadName() + " is miscalculated"); - } - } catch ( Exception e ) { - System.out.printf("Exception at %s at %s%n", read.getReadName(), GenomeLocParser.createGenomeLoc(read)); - throw new RuntimeException(e); - } - } else { - ; // don't do anything - } - - return 0; - } - - /** - * reduceInit is called once before any calls to the map function. We use it here to setup the output - * bam file, if it was specified on the command line - * - * @return - */ - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer readScore, Integer data) { - return data + readScore; - } - - public void onTraversalDone(Integer data) { - out.printf(data.toString()); - } -} - -class MismatchCounter { - int nMM = 0; - int qSum = 0; - - public static MismatchCounter MAX_VALUE = new MismatchCounter(Integer.MAX_VALUE, Integer.MAX_VALUE ); - - public MismatchCounter() {} - - public MismatchCounter(int nMM, int qSum) { - this.nMM = nMM; - this.qSum = qSum; - } - - public void add(MismatchCounter that) { - this.nMM += that.nMM; - this.qSum += that.qSum; - } - - public MismatchCounter min(MismatchCounter that) { - int cmpQSum = Integer.valueOf(this.qSum).compareTo(that.qSum); - if ( cmpQSum < 0 ) { return this; } - else if ( cmpQSum > 0 ) { return that; } - else if ( this.nMM < that.nMM ) { return this; } - else if ( this.nMM > that.nMM ) { return that; } - else { return this; } - } - - public 
MismatchCounter minus(MismatchCounter that) { - return new MismatchCounter(this.nMM - that.nMM, this.qSum - that.qSum); - } - - public String toString() { return String.format("[MM %d %d]", nMM, qSum); } -} \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/graphalign/GraphReferenceBuilder.java b/archive/java/src/org/broadinstitute/sting/graphalign/GraphReferenceBuilder.java deleted file mode 100755 index c732fb897..000000000 --- a/archive/java/src/org/broadinstitute/sting/graphalign/GraphReferenceBuilder.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.graphalign; - -import net.sf.picard.reference.ReferenceSequence; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectOutputStream; -import java.util.Arrays; -import java.util.List; - -/** - * A completely experimental walker that constructs a graphical reference that incorporates variation from provided - * RODs [Not for public use and will change drastically in the future]. 
- */ -@WalkerName("GraphReferenceBuilder") -@Requires(value={DataSource.REFERENCE}) -public class GraphReferenceBuilder extends RefWalker { - @Argument(fullName="graphFile", shortName="GF", doc="", required=true) - String graphFile = null; - - @Argument(fullName="DEBUG", shortName="DB", doc="", required=false) - boolean DEBUG = false; - - @Argument(fullName="VALIDATE", shortName="VD", doc="", required=false) - boolean VALIDATE_GRAPH = false; - - @Argument(fullName="printFrequency", shortName="F", doc="", required=false) - int printFrequency = 10000; - - ObjectOutputStream graphSerialStream = null; - - ReferenceGraph graphRef = null; - ReferenceSequenceFile flatReferenceFile = null; - - public void initialize() { - super.initialize(); - - graphRef = new ReferenceGraph(DEBUG); - - try { - graphSerialStream = new ObjectOutputStream( new FileOutputStream( graphFile ) ); - } catch ( FileNotFoundException e ) { - throw new StingException("Couldn't open file " + graphFile, e); - } catch ( IOException e ) { - throw new StingException("Couldn't write to file " + graphFile, e); - } - - flatReferenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.getToolkit().getArguments().referenceFile); - - ReferenceSequence refSeq = flatReferenceFile.nextSequence(); - do { - //logger.info("Read " + refSeq); - graphRef.bindRefenceSequence(refSeq); - logger.info(String.format("contig %s has length %d", refSeq.getName(), refSeq.length())); - refSeq = flatReferenceFile.nextSequence(); - } while ( refSeq != null ); - - System.out.println(graphRef.toBriefString()); - } - - int counter = printFrequency; - public Integer map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) { -// if ( context.getLocation().getStart() == 59384 ) { -// try { -// Thread.currentThread().sleep(5000); -// } catch (InterruptedException e) { -// ; -// } -// } - - boolean alreadyAddedAtThisLoc = false; - for ( VariantContext vc : rodData.getAllVariantContexts(ref)) { - if ( ! 
alreadyAddedAtThisLoc ) { - // if we have multiple variants at a locus, just take the first damn one we see for now - // todo -- getAlternativeBases should be getAlleles() - GenomeLoc loc = VariantContextUtils.getLocation(vc); - String[] allAllelesList = null; // variant.getAlternateBases().split(""); // todo fixme - if ( allAllelesList.length >= 3 ) { // bad dbSNP format :-( - List alleles = Arrays.asList(allAllelesList).subList(1,3); - //logger.info(String.format("Adding %s %s", loc, alleles)); - graphRef.addVariation(vc, loc, alleles); - //logger.info(String.format(" Added %s %s", loc, alleles)); - alreadyAddedAtThisLoc = true; - if ( counter-- == 0 ) { - logger.info(String.format("Added %s %s %s", loc, alleles, graphRef.toBriefString())); - counter = printFrequency; - if ( VALIDATE_GRAPH ) - graphRef.validateGraph(); - } - } - } - } - - return null; - } - - // todo -- graph should be the reduce result - public Integer reduceInit() { - return null; - } - - public Integer reduce(Integer value, Integer sum) { - return sum; - } - - public void onTraversalDone(Integer sum) { - super.onTraversalDone(sum); - try { - graphSerialStream.writeObject(graphRef); - graphSerialStream.close(); - } catch ( IOException e ) { - throw new StingException("Couldn't write to file " + graphFile, e); - } - } -} - diff --git a/archive/java/src/org/broadinstitute/sting/graphalign/ReferenceGraph.java b/archive/java/src/org/broadinstitute/sting/graphalign/ReferenceGraph.java deleted file mode 100755 index 158ebc01b..000000000 --- a/archive/java/src/org/broadinstitute/sting/graphalign/ReferenceGraph.java +++ /dev/null @@ -1,418 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.graphalign; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.jgrapht.graph.DefaultEdge; -import org.jgrapht.graph.SimpleDirectedGraph; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.*; -import net.sf.picard.reference.ReferenceSequence; -import 
net.sf.picard.util.IntervalTree; -import net.sf.samtools.util.StringUtil; - -import java.util.*; -import java.io.Serializable; -import java.io.IOException; -import java.io.ObjectInputStream; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: Oct 22, 2009 - * Time: 2:15:54 PM - * To change this template use File | Settings | File Templates. - */ -class ReferenceGraph extends SimpleDirectedGraph implements Serializable { - final private static boolean USE_IT = true; - final private static boolean THROW_ERRORS_ON_BAD_INPUTS = false; - - private boolean DEBUG = false; - int nSkippedIndels = 0; - int nSkippedBecauseOfContinguousVariation = 0; - int nBadPolymorphisms = 0; - int nMultiStateAlleles = 0; - - GenomeLoc initialLoc = null; - - private transient IntervalTree loc2Fragment = new IntervalTree(); - - public ReferenceGraph(boolean printDebuggingInfo) { - super(DefaultEdge.class); - DEBUG = printDebuggingInfo; - } - - public ReferenceGraph() { - this(false); - } - - public void setDebugPrinting(boolean enable) { - this.DEBUG = enable; - } - - public void bindRefenceSequence(ReferenceSequence seq) { - GenomeLoc refSeqLoc = GenomeLocParser.createGenomeLoc(seq.getContigIndex(), 1, seq.length()); - String refString = StringUtil.bytesToString(seq.getBases()).toUpperCase(); - Fragment frag = new Fragment(refSeqLoc, 1, StringUtil.stringToBytes(refString)); - addFragment(frag); - initialLoc = refSeqLoc; - } - - private void addFragmentToIntervalTree(Fragment frag) { - loc2Fragment.put((int)frag.getLocation().getStart(), (int)frag.getLocation().getStop(), frag); - } - - private void addFragment(Fragment frag) { - addFragmentToIntervalTree(frag); - addVertex(frag); - } - - private void removeFragment(Fragment frag) { - loc2Fragment.remove((int)frag.getLocation().getStart(), (int)frag.getLocation().getStop()); - removeVertex(frag); - } - - public void validateGraph() { - for ( Fragment v : this.vertexSet() ) { - if ( this.inDegreeOf(v) == 0 && 
v.getLocation().getStart() != initialLoc.getStart() ) { - throw new StingException(String.format("Fragment %s has no incoming edges but isn't at the start of the contig %s", v, initialLoc)); - } - if ( this.outDegreeOf(v) == 0 && v.getLocation().getStop() != initialLoc.getStop() ) { - throw new StingException(String.format("Fragment %s has no outgoing edges but isn't at the end of the contig %s", v, initialLoc)); - } - } - - //System.out.printf("Passed validation: %s%n", this.toBriefString()); - } - - private void rebuildIntervalTree() { - if ( DEBUG ) System.out.printf("rebuilding IntervalTree()%n"); - for ( Fragment v : this.vertexSet() ) { - if ( DEBUG ) System.out.printf(" adding interval tree: %s%n", v); - addFragmentToIntervalTree(v); - } - } - - private boolean allelesAreInExcisedFragment(Fragment cut, List alleles) { - boolean foundRef = false; - for ( String allele : alleles ) { - if ( allele.equals(cut.getBases()) ) - foundRef = true; - } - - if ( ! foundRef && THROW_ERRORS_ON_BAD_INPUTS ) - throw new StingException(String.format("Polymorphic alleles %s do not contain the reference sequence %s", alleles, cut.getBases())); - - return foundRef; - } - - public void addVariation(VariantContext variant, GenomeLoc loc, List alleles) { - if ( DEBUG ) System.out.printf("addVariation(%s, %s)%n", loc, alleles); - //validateGraph(); - - if ( variant.isSNP() ) { - Fragment frag = getContainingFragment(loc); - - if ( frag == null ) { - nMultiStateAlleles++; - return; - } - - if ( ! 
allelesAreInExcisedFragment(subFragment(frag, loc, 1), alleles)) { - nBadPolymorphisms++; - return; - } - - List split = exciseFragment(frag, loc); - if ( split != null ) { - Fragment left = split.get(0); - Fragment cut = split.get(1); - Fragment right = split.get(2); - - if ( DEBUG ) System.out.printf(" cutFrag(%s, %s)%n", loc, cut); - - for ( String allele : alleles ) { - byte[] bases = StringUtil.stringToBytes(allele); - double freq = 1.0 / alleles.size(); - Fragment alleleFrag = new Fragment(loc, freq, 0, bases.length, bases); - if ( DEBUG ) System.out.printf(" Creating allele fragment %s%n", alleleFrag); - addFragment(alleleFrag); - if ( left != null ) addEdge(left, alleleFrag); - if ( right != null ) addEdge(alleleFrag, right); - } - } else { - nSkippedBecauseOfContinguousVariation++; - } - } else { - nSkippedIndels++; - } - } - - - private Fragment subFragment(Fragment frag, GenomeLoc loc, double freq ) { - return new Fragment(loc, 1, frag.getUnderlyingBases()); - } - - public List exciseFragment(Fragment frag, GenomeLoc loc) { - if ( DEBUG ) System.out.printf(" exciseFragment(%s, %s)%n", frag, loc); - GenomeLoc fragLoc = frag.getLocation(); - - Fragment cut = subFragment(frag, loc, 1); - - Set inToFrag = incomingEdgesOf(frag); - Set outOfFrag = outgoingEdgesOf(frag); - - Fragment left = null; - if ( fragLoc.getStart() == loc.getStart() ) { - if ( ! inToFrag.isEmpty() ) { - if ( THROW_ERRORS_ON_BAD_INPUTS ) - throw new StingException(String.format("Attempting to create a variation at the start of a fragment %s %s", frag, loc)); - return null; - } - } else { - GenomeLoc leftLoc = GenomeLocParser.createGenomeLoc(fragLoc.getContigIndex(), fragLoc.getStart(), loc.getStart()-1); - left = new Fragment(leftLoc, 1, frag.getUnderlyingBases()); - addFragment(left); - - for ( DefaultEdge e : inToFrag ) { - addEdge(getEdgeSource(e), left); - } - - removeAllEdges(inToFrag); - } - - Fragment right = null; - if ( fragLoc.getStop() == loc.getStop() ) { - if ( ! 
outOfFrag.isEmpty() ) { - throw new StingException(String.format("Attempting to create a variation at the end of a fragment %s %s", frag, loc)); - } - } else { - GenomeLoc rightLoc = GenomeLocParser.createGenomeLoc(fragLoc.getContigIndex(), loc.getStop()+1, fragLoc.getStop()); - right = new Fragment(rightLoc, 1, frag.getUnderlyingBases()); - addFragment(right); - - for ( DefaultEdge e : outOfFrag ) { - addEdge(right, getEdgeTarget(e)); - } - - removeAllEdges(outOfFrag); - } - - if ( DEBUG ) System.out.printf(" removing %s%n", frag); - removeFragment(frag); - if ( DEBUG ) System.out.printf(" returning left=%s right=%s%n", left, right); - return Arrays.asList(left, cut, right); - } - - public Fragment getContainingFragment(GenomeLoc loc) { - Fragment frag = USE_IT ? getContainingFragmentIT(loc) : getContainingFragmentG(loc); - - - if ( frag == null ) { - if ( THROW_ERRORS_ON_BAD_INPUTS ) - throw new StingException("No spanning fragment was found for " + loc); - else - return null; - } - else if ( frag.getLocation().getStart() > loc.getStart() || frag.getLocation().getStop() < loc.getStop() ) - throw new StingException("BUG: bad spanning fragment found for " + loc + " was " + frag.getLocation() ); - else - return frag; - } - - public Fragment getContainingFragmentG(GenomeLoc loc) { - for ( Fragment v : this.vertexSet() ) { - if ( v.getLocation().containsP(loc) ) { - return v; - } - } - - return null; - } - - public Fragment getContainingFragmentIT(GenomeLoc loc) { - IntervalTree.Node node = loc2Fragment.minOverlapper((int)loc.getStart(), (int)loc.getStop()); - if ( node == null ) - return null; - else - return node.getValue(); - } - - public Collection getStartingFragment(GenomeLoc loc) { - Collection frags = USE_IT ? 
getStartingFragmentIT(loc) : getStartingFragmentG(loc); - //Collection frags = getStartingFragmentTest(loc); - - if ( frags == null || frags.size() == 0 ) - throw new StingException("No fragment contains location start of " + loc); - if ( frags.size() == 1 && MathUtils.compareDoubles(frags.iterator().next().getFrequency(), 1.0) != 0 ) { - Fragment bad = frags.iterator().next(); - throw new StingException(String.format("Only one fragment was found but it's frequency < 1 %s with %e", bad, bad.getFrequency())); - } - else - return frags; - } - - public Collection getStartingFragmentTest(GenomeLoc loc) { - Collection fragsFromIT = getStartingFragmentIT(loc); - Collection fragsFromG = getStartingFragmentG(loc); - - if ( fragsFromIT.size() != fragsFromG.size() ) { - throw new StingException(String.format("Fragment sizes differ %d from IntervalTree, %d from graph", fragsFromIT.size(), fragsFromG.size())); - } - - return USE_IT && false ? fragsFromIT : fragsFromG; - } - - public Collection getStartingFragmentIT(GenomeLoc loc) { - Collection frags = new HashSet(); - - Iterator> it = loc2Fragment.overlappers((int)loc.getStart(), (int)loc.getStart()); - IntervalTree.Node node = null; - while ( it.hasNext() ) { - node = it.next(); - frags.add(node.getValue()); - } - - // todo -- painful bug work around -- should be removed - if ( frags.size() == 1 && MathUtils.compareDoubles(node.getValue().getFrequency(), 1.0) != 0 ) { - System.out.printf(">>> Using IT workaround at %s <<<%n", loc); - return getStartingFragmentG(loc); - } - - return frags; -// IntervalTree.Node node = loc2Fragment.minOverlapper((int)loc.getStart(), (int)loc.getStart()); -// if ( node == null ) -// return null; -// else -// return node.getValue(); - } - - public Collection getStartingFragmentG(GenomeLoc loc) { - Collection frags = new HashSet(); - for ( Fragment v : this.vertexSet() ) { - //if ( v.getLocation().getStart() < loc.getStop() ) - // System.out.printf("Checking %s vs. 
%s%n", loc, v.getLocation()); - if ( v.getLocation().containsStartPosition(loc.getStart()) ) { - // System.out.printf(" Adding %s%n", v.getLocation()); - frags.add(v); - } - } - - return frags; - } - - public Set outgoingFragments( Fragment frag ) { - Set outgoingFrags = new HashSet(); - - for ( DefaultEdge e : outgoingEdgesOf(frag) ) { - outgoingFrags.add(getEdgeTarget(e)); - } - - if ( outgoingFrags.size() == 0 && frag.getLocation().getStop() != initialLoc.getStop() ) { - - } - - return outgoingFrags; - } - - public String toString() { - StringBuilder s = new StringBuilder(); - - for ( Fragment v : this.vertexSet() ) { - s.append(String.format("Fragment: %s%n", v.toString())); - for ( DefaultEdge e : this.incomingEdgesOf(v) ) { - s.append(String.format(" [IN FROM] %s%n", this.getEdgeSource(e))); - } - for ( DefaultEdge e : this.outgoingEdgesOf(v) ) { - s.append(String.format(" [OUT TO ] %s%n", this.getEdgeTarget(e))); - } - } - - return s.toString(); - } - - public String toBriefString() { - return String.format("GraphRef: %d fragments, %d edges, skipped %d contingous variants, %d indels, %d polymorphisms w/o ref allele, %d multi-state", - this.vertexSet().size(), this.edgeSet().size(), nSkippedBecauseOfContinguousVariation, nSkippedIndels, nBadPolymorphisms, nMultiStateAlleles); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // serialization - // - // -------------------------------------------------------------------------------------------------------------- - private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException { - //always perform the default de-serialization first - stream.defaultReadObject(); - loc2Fragment = new IntervalTree(); - rebuildIntervalTree(); - } -} - -class Fragment implements Serializable { - GenomeLoc loc = null; // Index position of this fragment into the reference - int offset, stop; - double freq = -1; - byte[] bases = 
null; - - public Fragment( GenomeLoc loc, double freq, int offset, int stop, byte[] bases ) { - this.loc = loc; - this.freq = freq; - this.bases = bases; - this.offset = offset; - this.stop = stop; - } - - public Fragment( GenomeLoc loc, double freq, byte[] bases ) { - this(loc, freq, (int)loc.getStart()-1, (int)loc.getStop(), bases); - } - - public String toString() { - //return String.format("%s:%.2f:%s", loc.toString(), getFrequency(), getBases()); - return String.format("%s:%.2f", loc.toString(), getFrequency()); - } - - public GenomeLoc getLocation() { - return loc; - } - - public double getFrequency() { - return freq; - } - - public int getUnderlyingOffset() { return offset; } - public int getStop() { return stop; } - public int getLength() { return getStop() - getUnderlyingOffset(); } - - public byte[] getUnderlyingBases() { - return bases; - } - - /** - * how many bases over in the fragment are we over in this fragment? - * - * @param loc - * @return - */ - public int getFragOffsetFrom(GenomeLoc loc) { - // todo -- ignores contigs -- can we fix this? 
- if ( getLocation().getStart() > loc.getStart() ) - throw new StingException("BUG: Request for offset from " + loc + " in frag at " + getLocation() + " but this is beyond the location of the fragment"); - return (int)(loc.getStart() - getLocation().getStart()); - } - - public int getBaseLengthFrom( int fragOffset, int maxLength ) { - int fragRemaining = getLength() - fragOffset; - - if ( fragRemaining < 0 ) - throw new StingException("BUG: Request for length from offset " + fragOffset + " but this is longer than the fragment itself"); - - return Math.min(fragRemaining, maxLength); - } - - public byte getBase(int fragOffset) { - return bases[getUnderlyingOffset() + fragOffset]; - } - - public String getBases() { - return StringUtil.bytesToString(getUnderlyingBases(), getUnderlyingOffset(), getLength()); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/graphalign/package-info.java b/archive/java/src/org/broadinstitute/sting/graphalign/package-info.java deleted file mode 100755 index a129ea97f..000000000 --- a/archive/java/src/org/broadinstitute/sting/graphalign/package-info.java +++ /dev/null @@ -1,5 +0,0 @@ -/** - * Totally experimental tools for working with a graphical reference (i.e., one that explicitly represents variation). - * Not reliable, complete, or even optimized. 
Purely for initial evaluation of the approach - */ -package org.broadinstitute.sting.playground.gatk.walkers.graphalign; \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/mendelian/BeagleTrioToVCFWalker.java b/archive/java/src/org/broadinstitute/sting/mendelian/BeagleTrioToVCFWalker.java deleted file mode 100755 index 007708deb..000000000 --- a/archive/java/src/org/broadinstitute/sting/mendelian/BeagleTrioToVCFWalker.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; - -import org.broad.tribble.vcf.VCFRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; -import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.MendelianViolationEvaluator; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; - -import java.util.Arrays; -import java.util.List; -import java.util.Map; - -/** - * Test routine for new VariantContext object - */ -@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="variants",type=ReferenceOrderedDatum.class), @RMD(name="beagle",type=BeagleROD.class)}) -public class BeagleTrioToVCFWalker extends RodWalker { - @Argument(shortName="trio", doc="If provide, treats the input VCF as a single record containing genotypes for a single trio; String formatted as dad+mom=child", required=false) - protected String TRIO_STRUCTURE; - - @Argument(shortName="eth", fullName="excludeTripleHets", doc="If provide, sites that are triple hets calls will not be phased, regardless of Beagle's value", required=false) - protected boolean dontPhaseTripleHets = false; - - int nTripletHets = 0; - - private MendelianViolationEvaluator.TrioStructure trio = null; - - private VCFWriter writer; - private boolean headerWritten = false; - private final static String TRACK_NAME = "variants"; - private final static String BEAGLE_NAME = 
"beagle"; - - public void initialize() { - trio = MendelianViolationEvaluator.parseTrioDescription(TRIO_STRUCTURE); - writer = new VCFWriter(out); - } - - public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - VariantContext vc = null; - - if ( ref != null ) { - vc = tracker.getVariantContext(ref, TRACK_NAME, null, context.getLocation(), false); - BeagleROD beagle = tracker.lookup(BEAGLE_NAME,BeagleROD.class); - - if ( vc != null ) { - if ( ! headerWritten ) { - VCFRecord vcfrod = tracker.lookup(TRACK_NAME,VCFRecord.class); - writer.writeHeader(vcfrod.getHeader()); - headerWritten = true; - } - - //System.out.printf("VCF: %s%n", tracker.lookup(TRACK_NAME, null)); - vc = maybePhaseVC(vc, beagle); - } - } - - if ( vc != null ) - writer.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase())); - - return vc; - } - - private VariantContext maybePhaseVC(VariantContext unphased, BeagleROD beagle) { - if ( beagle == null ) { - return unphased; - } else { - Map> bglData = beagle.getGenotypes(); - List momBgl = bglData.get(trio.mom); - List dadBgl = bglData.get(trio.dad); - - Genotype unphasedMom = unphased.getGenotype(trio.mom); - Genotype unphasedDad = unphased.getGenotype(trio.dad); - Genotype unphasedKid = unphased.getGenotype(trio.child); - - if ( dontPhaseTripleHets && unphasedMom.isHet() && unphasedDad.isHet() && unphasedKid.isHet() ) { - nTripletHets++; - return unphased; - } - else { - Allele momTrans = unphased.getAllele(momBgl.get(0)); - Allele momUntrans = unphased.getAllele(momBgl.get(1)); - Allele dadTrans = unphased.getAllele(dadBgl.get(0)); - Allele dadUntrans = unphased.getAllele(dadBgl.get(1)); - - Genotype momG = phaseGenotype(unphasedMom, Arrays.asList(momTrans, momUntrans)); - Genotype dadG = phaseGenotype(unphasedDad, Arrays.asList(dadTrans, dadUntrans)); - Genotype kidG = phaseGenotype(unphasedKid, Arrays.asList(momTrans, dadTrans)); - - return new VariantContext(unphased.getName(), 
unphased.getLocation(), unphased.getAlleles(), - Arrays.asList(momG, dadG, kidG), unphased.getNegLog10PError(), unphased.getFilters(), unphased.getAttributes()); - } - } - } - - private Genotype phaseGenotype(Genotype base, List alleles) { - return new Genotype(base.getSampleName(), alleles, base.getNegLog10PError(), base.getFilters(), base.getAttributes(), true); - } - - public Long reduceInit() { - return 0L; - } - - public Integer reduce(VariantContext point, Integer sum) { - return sum; - } - - public void onTraversalDone(Long result) { - logger.info(String.format("Ignored phasing of %d het/het/het genotypes", nTripletHets)); - //logger.info(String.format("Converted %d (%.2f%%) of these sites", result.nConverted, (100.0 * result.nConverted) / result.nVariants)); - } - - public Long reduce(VariantContext vc, Long prevReduce) { - return ( vc == null ? prevReduce : prevReduce+1); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/mendelian/DeNovoSNPWalker.java b/archive/java/src/org/broadinstitute/sting/mendelian/DeNovoSNPWalker.java deleted file mode 100755 index 561e9711e..000000000 --- a/archive/java/src/org/broadinstitute/sting/mendelian/DeNovoSNPWalker.java +++ /dev/null @@ -1,123 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariationRod; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; - -import 
java.util.ArrayList; -import java.util.List; -import java.util.Set; - -/** - * Implements a rudimentary algorithm for calling SNPs that are de novo in that they appear in an individual but not in - * its parents. - */ - -@By(DataSource.REFERENCE) -@Requires(value={DataSource.REFERENCE, DataSource.REFERENCE_BASES, DataSource.READS},referenceMetaData={@RMD(name="child",type= VariationRod.class)}) -@Allows({DataSource.READS, DataSource.REFERENCE}) -@ReadFilters(ZeroMappingQualityReadFilter.class) -//, @RMD(name="parent1",type= VariationRod.class), @RMD(name="parent2",type= VariationRod.class)}) - -public class DeNovoSNPWalker extends RefWalker{ -/** - * Implements a rudimentary algorithm for calling SNPs that are de novo in that they appear in an individual but not in - * its parents. Using BAM files corresponding to parents and child, it calls UnifiedGenotyper directly and outputs a - * confidence for positions being de novo SNPs. - * */ - - UnifiedGenotyper UG; - private List> readGroupSets; - - public void initialize() { - UG = new UnifiedGenotyper(); - UG.initialize(); - - readGroupSets = getToolkit().getMergedReadGroupsByReaders(); - } - - public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - VariationRod child = tracker.lookup("child",VariationRod.class); - VariationRod dbsnp = tracker.lookup(rodDbSNP.STANDARD_DBSNP_TRACK_NAME,VariationRod.class); - - if (child != null) { - if (child.isSNP() && child.getNegLog10PError() > 5) { // BTR > 5 - - List reads = context.getReads(); - List offsets = context.getOffsets(); - - List parent1_reads = new ArrayList(); - List parent2_reads = new ArrayList(); - List parent1_offsets = new ArrayList(); - List parent2_offsets = new ArrayList(); - - assert( reads.size() == offsets.size() ); // should be same number or we're in trouble - int num_reads = reads.size(); - for (int i=0; i 5 && - !parent2call.isHomRef() && - parent2call.getNegLog10PError() > 5) { - - double sumConfidences = 0.5 
* (0.5 * child.getNegLog10PError() + - Math.min(parent1call.getNegLog10PError(), parent2call.getNegLog10PError())); - - out.format("%s\t", child.getLocation().getContig()); - out.format("%s\t", child.getLocation().getStart()); - out.format("%.4f\t", sumConfidences); - out.format("%.4f\t", child.getNegLog10PError()); - out.format("%.4f\t", parent1call.getNegLog10PError()); - out.format("%.4f\t", parent2call.getNegLog10PError()); - out.format("%s\t", dbsnp != null); - - out.format ("%s\t", child.toString()); - out.format ("%s\t", parent1.toString()); - out.format ("%s", parent2.toString()); - if (dbsnp != null) - out.format ("\tDBSNP\t:%s", dbsnp.toString()); - out.println(); - } - } - } - } - - return ""; - } - - public Integer reduceInit() { return 0; } - public Integer reduce(String line, Integer a) { - return 1; - } - - public void onTraversalDone(Integer result) {} // Don't print the reduce result -} diff --git a/archive/java/src/org/broadinstitute/sting/mendelian/GenotypingCallStats.java b/archive/java/src/org/broadinstitute/sting/mendelian/GenotypingCallStats.java deleted file mode 100644 index d27d62bcd..000000000 --- a/archive/java/src/org/broadinstitute/sting/mendelian/GenotypingCallStats.java +++ /dev/null @@ -1,80 +0,0 @@ -package org.broadinstitute.sting.playground.utils; - -import org.broadinstitute.sting.utils.Utils; - -/** - * This class is a trivial wrapper for keeping together and passing around a few simple statistics relevant for genotyping: - * e.g. number of covered bases (have any observation at all), number of "assessed" bases (e.g. those with confidence level - * above some cutoff, so that a call was actually made), number of ref/variant calls etc. 
- * @author asivache - * - */ -public class GenotypingCallStats { - public long covered = 0; // number of loci covered in an individual (not necessarily confidently called) - public long assessed = 0; // number of loci with confident calls - public long ref = 0; // number of assessed loci where the reference is called - public int variant = 0; // number of assessed loci where a variant is observed in the individual - // NOTE: consistent_ref + inconsistent_ref is the total number of ref calls assessed for consistency (by some external application). - // this number does not have to be equal to 'ref' ( total number of ref calls in this individual - we migh be unable to assess - // the consistency for all of them!); same applies to (in)consistent_variant. - public int consistent_variant = 0; // variants that are consistent in any (application-specific) sense, e.g. variant matches variants in other members of the family trio - public long consistent_ref = 0; // reference calls that are consistent in any (app-specific) sense, e.g. consistent with other members of the family trio - public int inconsistent_variant = 0; // variants that are inconsistent in any (application-specific) sense, e.g. variant does not match variants in other members of the family trio - public long inconsistent_ref = 0; // reference calls that are inconsistent in any (app-specific) sense, e.g. 
inconsistent with other members of the family trio - public int non_biallelic_variant = 0; // number of variant calls that are not biallelic - - public GenotypingCallStats add(GenotypingCallStats other) { - this.covered += other.covered; - this.assessed += other.assessed; - this.ref += other.ref; - this.variant += other.variant; - this.consistent_variant += other.consistent_variant; - this.consistent_ref += other.consistent_ref; - this.inconsistent_variant += other.inconsistent_variant; - this.inconsistent_ref += other.inconsistent_ref; - this.non_biallelic_variant += other.non_biallelic_variant; - return this; - } - -// public int totalVariants() { return consistent_variant + inconsistent_variant + non_biallelic_variant; } - - public long assessedForConsistencyRef() { return consistent_ref + inconsistent_ref; } - public int assessedForConsistencyVariant() { return consistent_variant + inconsistent_variant; } - - public String toString() { - StringBuilder b = new StringBuilder(); - - - b.append( String.format(" covered: %d%n assessed: %d (%3.2f%% covered)%n", - covered, assessed, Utils.percentage(assessed, covered) ) - ); - - b.append( String.format(" ref: %d (%3.2f%% assessed)%n", - ref, Utils.percentage(ref,assessed)) - ); - - long zr = assessedForConsistencyRef(); - b.append( String.format(" ref assessed for consistency: %d (%3.2f%% ref)%n", - zr, Utils.percentage(zr, ref) ) - ); - b.append( String.format(" consistent ref: %d (%3.2f%% consistency-assessed ref)%n", - consistent_ref, Utils.percentage(consistent_ref,zr) ) - ); - - b.append( String.format(" variants: %d (%3.2f%% assessed, or 1 per %3.2f kB)%n", - variant, Utils.percentage(variant,assessed), ((double)assessed/variant)/1000.0 ) - ); - b.append( String.format(" multiallelic: %d (%3.2f%% variants)%n", - non_biallelic_variant, Utils.percentage(non_biallelic_variant, variant)) - ); - - int zv = assessedForConsistencyVariant(); - b.append( String.format(" variants assessed for consistency: %d (%3.2f%% 
variants)%n", - zv, Utils.percentage(zv, variant) ) - ); - b.append( String.format(" consistent variants: %d (%3.2f%% consistency-assessed variants)%n", - consistent_variant, Utils.percentage(consistent_variant,zv) ) - ); - return b.toString(); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/mendelian/MendelianInheritanceWalker.java b/archive/java/src/org/broadinstitute/sting/mendelian/MendelianInheritanceWalker.java deleted file mode 100644 index 6d1298344..000000000 --- a/archive/java/src/org/broadinstitute/sting/mendelian/MendelianInheritanceWalker.java +++ /dev/null @@ -1,246 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers; - - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.Genotype; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.playground.utils.GenotypingCallStats; -import org.broadinstitute.sting.playground.utils.TrioConcordanceRecord; -import org.broadinstitute.sting.utils.GenotypeUtils; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.util.List; - -//@Requires(value=DataSource.REFERENCE,referenceMetaData={@RMD(name="mother",type=rodSAMPileup.class), -// @RMD(name="father",type=rodSAMPileup.class), -// @RMD(name="daughter",type=rodSAMPileup.class)}) -public class MendelianInheritanceWalker extends RefWalker { - - @Argument(fullName="point_consensus_cutoff", shortName="XPC", doc="confidence cutoff for consensus in point genotype", required=true ) public Double POINT_CONS_CUTOFF; - @Argument(fullName="point_variant_cutoff", shortName="XPV", doc="confidence cutoff for variant (snp) in point genotype", required=true ) public Double POINT_VAR_CUTOFF; - 
@Argument(fullName="indel_consensus_cutoff", shortName="XIC", doc="confidence cutoff for consensus in indel genotype", required=true ) public Double INDEL_CONS_CUTOFF; - @Argument(fullName="indel_variant_cutoff", shortName="XIV", doc="confidence cutoff for variant (snp) in indel genotype", required=true ) public Double INDEL_VAR_CUTOFF; - @Argument(fullName="log_concordant", shortName="LC",doc="If set, all trio-concordant sites will be logged at level INFO") public boolean LOG_CONCORDANT; - @Argument(fullName="log_discordant", shortName="LD",doc="If set, all trio-discordant sites will be logged at level INFO") public boolean LOG_DISCORDANT; - @Argument(fullName="default_reference_calls",shortName="DRC", - doc="If set, any position where the specified genotype is NOT explicitly specified, while the other is provided, is considered to be an implicit confident 'reference' (no-indel or no-snp) call") - public boolean defCalls; - @Argument(fullName="variant_type", - shortName="VT", - doc="Assess concordance for the variants of the specified type, INDEL or POINT. 
If genotype track(s) provide both types, the requested one will be selected", - required=true) - public String VTYPE_STR; - - private static Logger logger = Logger.getLogger(MendelianInheritanceWalker.class); - private final static String star = new String("*"); - private GenotypeUtils.VariantType VARIANT_TYPE; - - @Override - public TrioConcordanceRecord map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) { - -// String outLine = new String(context.getLocation() + " REF: "+ref + " RODS:" + rodData.getAllRods().size()); - - ReferenceOrderedDatum rodMom = rodData.lookup("mother", null); - ReferenceOrderedDatum rodDad = rodData.lookup("father", null); - ReferenceOrderedDatum rodKid = rodData.lookup("daughter", null); - - Genotype mom = GenotypeUtils.extractGenotype(rodMom,VARIANT_TYPE,defCalls); - Genotype dad = GenotypeUtils.extractGenotype(rodDad,VARIANT_TYPE,defCalls); - Genotype kid = GenotypeUtils.extractGenotype(rodKid,VARIANT_TYPE,defCalls); - - return assessGenotypesInTrio(mom, dad, kid); - } - - -/** - * @Override - * @see org.broadinstitute.sting.gatk.walkers.Walker#initialize() - */ - public void initialize() { - super.initialize(); - VARIANT_TYPE = GenotypeUtils.VariantType.valueOf(VTYPE_STR.toUpperCase()); - }; - - - /** Takes a single genotype object and returns properly filled new assessment object (covered/assessed/ref/variant set to 0/1 - * according to what the genotype says) - * @param g - * @return - */ - protected GenotypingCallStats assessGenotype(Genotype g, GenotypingCallStats stats) { - - if ( g != null ) stats.covered = 1; - - if ( hasCall(g)) { - stats.assessed = 1; - if ( g.isReference() ) stats.ref = 1; - else { - stats.variant = 1; - if ( ! 
g.isBiallelic() ) stats.non_biallelic_variant = 1; - } - } - return stats; - } - - public TrioConcordanceRecord assessGenotypesInTrio(Genotype gMom, Genotype gDad, Genotype gKid) { - - TrioConcordanceRecord t = new TrioConcordanceRecord(); - -// String outLine = new String(context.getLocation() + " REF: "+ref + " RODS:" + rodData.getAllRods().size()); - - // first get separate stats on each individual - assessGenotype(gMom,t.mom); - assessGenotype(gDad,t.dad); - assessGenotype(gKid,t.kid); - - // if ( hasCall(mom) && mom.isIndel() ) System.out.println("GOT INDEL: "+mom.toString()); - - if ( t.mom.covered == 0 || t.dad.covered == 0 || t.kid.covered== 0 ) return t; // current position is not covered in at least one individual, there's nothing else to do - t.trio.covered = 1; // ok, base covered in a trio (e.g. in all individuals) - - if ( t.mom.assessed != 0 && t.dad.assessed != 0 && t.kid.assessed != 0 ) { - t.trio.assessed = 1; // assessed in trio = assessed in each individual - } else return t; // at least one individual is not assessed, nothing left to do - - // we are here only if everyone is assessed - - // NOTE: "consistent_ref" in individuals is counted only when all 3 are assessed AND all 3 are ref (i.e. 
ref call in a trio) - if( gMom.isReference() && gDad.isReference() && gKid.isReference() ) { // everyone is a ref - t.trio.ref = t.trio.consistent_ref = 1; - t.mom.consistent_ref = 1; - t.dad.consistent_ref = 1; - t.kid.consistent_ref = 1; - return t; // done - } - - // by now we know that there's a variant in at least one of the individuals - - t.trio.variant = 1; - - if ( t.mom.non_biallelic_variant == 1 || t.dad.non_biallelic_variant == 1 || t.kid.non_biallelic_variant == 1 ) { - t.trio.non_biallelic_variant = 1; - return t; - } - - String kid_allele_1 = gKid.getFWDAlleles().get(0); - String kid_allele_2 = gKid.getFWDAlleles().get(1); - List mom_alleles = gMom.getFWDAlleles(); - List dad_alleles = gDad.getFWDAlleles(); - - // warning: no special processing for X/Y chromosomes yet; not an issue for daughter - - - - if ( mom_alleles.contains(kid_allele_1) && dad_alleles.contains(kid_allele_2) || - mom_alleles.contains(kid_allele_2) && dad_alleles.contains(kid_allele_1) ) { - t.trio.consistent_variant = 1; - if ( ! gMom.isReference() ) { - t.mom.consistent_variant = 1 ; - if ( ! gKid.isReference() ) t.mom_passed_variant = 1; - } - if ( ! gDad.isReference() ) { - t.dad.consistent_variant = 1 ; - if ( ! gKid.isReference() ) t.dad_passed_variant = 1; - } - if ( ! gKid.isReference() ) t.kid.consistent_variant = 1 ; - - if ( LOG_CONCORDANT ) { - logger.info("consistent variant at "+gMom.getLocation() + - "("+gMom.getFWDRefBases()+") mom: " + genotypeString(gMom) + " dad: " + - genotypeString(gDad) + " kid: " + genotypeString(gKid) - ); - } - } else { - // we are inconsistent. let's see what happened: - - t.trio.inconsistent_variant = 1; - - if ( ! gMom.isReference() ) t.mom.inconsistent_variant = 1 ; - if ( ! gDad.isReference() ) t.dad.inconsistent_variant = 1 ; - if ( ! gKid.isReference() ) t.kid.inconsistent_variant = 1; - - if ( gKid.isReference() && ( ! gMom.isReference() && gMom.isHom() || ! 
gDad.isReference() && gDad.isHom() ) ) t.missing_variant_in_kid = 1; - if ( ! gKid.isReference() && gMom.isReference() && gDad.isReference() ) t.missing_variant_in_parents = 1; - if ( ! gKid.isReference() && ( ! gMom.isReference() || ! gDad.isReference() ) ) t.nonmatching_variant_in_kid = 1; - - if ( LOG_DISCORDANT ) { - logger.info("inconsistent variant at "+gMom.getLocation() + - "("+gMom.getFWDRefBases()+") mom: " + genotypeString(gMom) + " dad: " + - genotypeString(gDad) + " kid: " + genotypeString(gKid) - ); - } - - } - - return t; - - } - - - protected String alleleString(Genotype g, int n) { - if ( g.getFWDAlleles().get(n).length() == 0 ) return star; - return g.getFWDAlleles().get(n); - } - - protected String genotypeString(Genotype g) { - return alleleString(g, 0) +"/"+alleleString(g, 1); - } - - @Override - public TrioConcordanceRecord reduce(TrioConcordanceRecord value, TrioConcordanceRecord sum) { - return sum.add(value); - } - - @Override - public TrioConcordanceRecord reduceInit() { - - return new TrioConcordanceRecord(); - } - - boolean hasCall(Genotype g) { - if ( g == null ) return false; // there's no call if there's no rod data available, duh! 
- - if ( g.isReference() ) { - if ( g.isPointGenotype() ) return g.getConsensusConfidence() >= POINT_CONS_CUTOFF ; - else return g.getConsensusConfidence() >= INDEL_CONS_CUTOFF ; - } - else { // it's a variant - if ( g.isPointGenotype() ) return g.getVariantConfidence() >= POINT_VAR_CUTOFF ; - else return g.getVariantConfidence() >= INDEL_VAR_CUTOFF ; - } - - } - - - /* - protected String shortLine(Genotype av) { - - if ( av == null ) return new String( ""); - - if ( av.isReference() ) return new String ("*"); - - List alleles = av.getFWDAlleles(); - - if ( av.isSNP() ) { - if ( alleles.get(0).charAt(0) == av.getRef() ) return alleles.get(1); - else return alleles.get(0); - } - if ( av.isInsertion() ) { - if ( alleles.get(0).length() == 0 ) return new String('+'+alleles.get(1)); - else return new String('+'+alleles.get(0)); - } - if ( av.isDeletion() ) { - if ( alleles.get(0).length() == 0 ) return new String ('-'+alleles.get(0)); - else return new String('-'+alleles.get(1)); - } - if ( av.isIndel() ) { - return new String('?'+alleles.get(0)); - } - return new String(""); - } - */ -} diff --git a/archive/java/src/org/broadinstitute/sting/mendelian/TrioConcordanceRecord.java b/archive/java/src/org/broadinstitute/sting/mendelian/TrioConcordanceRecord.java deleted file mode 100644 index 4bf7c332d..000000000 --- a/archive/java/src/org/broadinstitute/sting/mendelian/TrioConcordanceRecord.java +++ /dev/null @@ -1,133 +0,0 @@ -package org.broadinstitute.sting.playground.utils; - -import org.broadinstitute.sting.utils.Utils; - -/** - * This class is a trivial wrapper for keeping together and passing around counts of different possible outcomes of - * comparisons in a trio. Mendelian walker uses this class to classify/accumulate events such as consistent snp, inconsistent snp - * (e.g. only in a kid, but not in parents), loci with no calls etc. 
- * @author asivache - * - */ -public class TrioConcordanceRecord { - - public GenotypingCallStats mom; - public GenotypingCallStats dad; - public GenotypingCallStats kid; - - public GenotypingCallStats trio; - -// public long mom_assessed_ref; // number of ref calls in mother on positions assessed *in all 3 individuals* -// public long dad_assessed_ref; // ditto -// public long kid_assessed_ref; -// public int mom_assessed_variant; // number of variant calls in mother on positions assessed *in all 3 individuals* -// public int dad_assessed_variant; // ditto -// public int kid_assessed_variant; - public int missing_variant_in_kid; - public int nonmatching_variant_in_kid; - public int missing_variant_in_parents; - public int mom_passed_variant; - public int dad_passed_variant; - - // public long consistent_ref = 0; // number of assessed loci, where all 3 people have homogeneous reference allele -// public int consistent_variant = 0; // number of assessed loci where a variant is observed in at least one individual and genotyping calls are consistent between the trio members -// public int inconsistent_variant = 0; // number of assessed loci where a variant is observed in at least one individual and genotyping calls are inconsistent -// public int missing_variant_in_parents = 0; // number of inconsistent variants (see above), where parent(s) have a variant but the kid does not while she should -// public int missing_variant_in_kid = 0; // number of inconsistent variants (see above), where kid has a snp but the parents do not while they should -// public int consistent_variant_passed = 0; // variants that are consistent and *passed* (i.e. 
present in kid and one of the parents) -// public int non_biallelic_variant = 0; // number of variant calls that are not biallelic -// public long unclassified_events = 0; - - public TrioConcordanceRecord() { - mom = new GenotypingCallStats(); - dad = new GenotypingCallStats(); - kid = new GenotypingCallStats(); - trio = new GenotypingCallStats(); - } - - public TrioConcordanceRecord add(TrioConcordanceRecord other) { - - this.mom.add(other.mom); - this.dad.add(other.dad); - this.kid.add(other.kid); - - this.trio.add(other.trio); - -// this.mom_assessed_ref += other.mom_assessed_ref; -// this.dad_assessed_ref += other.dad_assessed_ref; -// this.kid_assessed_ref += other.kid_assessed_ref; -// this.mom_assessed_variant += other.mom_assessed_variant; -// this.dad_assessed_variant += other.dad_assessed_ref; -// this.kid_assessed_variant += other.kid_assessed_variant; - this.missing_variant_in_kid += other.missing_variant_in_kid ; - this.nonmatching_variant_in_kid += other.nonmatching_variant_in_kid ; - this.missing_variant_in_parents += other.missing_variant_in_parents ; - this.mom_passed_variant += other.mom_passed_variant; - this.dad_passed_variant += other.dad_passed_variant; - - // this.consistent_ref += other.consistent_ref; -// this.consistent_variant += other.consistent_variant; -// this.inconsistent_variant += other.inconsistent_variant; -// this.missing_variant_in_parents += other.missing_variant_in_parents; -// this.missing_variant_in_kid += other.missing_variant_in_kid; -// this.consistent_variant_passed += other.consistent_variant_passed; -// this.non_biallelic_variant += other.non_biallelic_variant; -// this.unclassified_events += other.unclassified_events; - return this; - } - - public int totalVariants() { return trio.consistent_variant + trio.inconsistent_variant + trio.non_biallelic_variant; } - - public String toString() { - StringBuilder b = new StringBuilder(); - - b.append(String.format("%ncovered in trio: %d%n", trio.covered ) ); - - 
b.append(String.format("assessed in trio: %d (%3.2f%% covered)%n", - trio.assessed, Utils.percentage(trio.assessed,trio.covered )) ); - - b.append(String.format(" reference in all samples: %d (%3.2f%% assessed)%n", - trio.ref, Utils.percentage(trio.ref,trio.assessed )) ); - - b.append(String.format(" variant sites: %d (%3.2f%% assessed, or 1 per %3.2f kB)%n", - totalVariants(), Utils.percentage(totalVariants(), trio.assessed), ((double)trio.assessed/totalVariants())/1000.0 - )); - - b.append(String.format(" consistent variants: %d (%3.2f%% variants)%n", - trio.consistent_variant, Utils.percentage(trio.consistent_variant,totalVariants()) - )); - -// b.append(String.format(" passed (in daughter and parent(s)): %d%n lost (in parent(s) but not in daughter): %d%n", -// consistent_variant_passed, consistent_variant - consistent_variant_passed)); - - b.append(String.format(" multiallelic variant: %d (%3.2f%% variants)%n", - trio.non_biallelic_variant, Utils.percentage(trio.non_biallelic_variant, totalVariants()) - )); - - b.append(String.format(" inconsistent variant: %d (%3.2f%% variants)%n", - trio.inconsistent_variant, Utils.percentage(trio.inconsistent_variant, totalVariants()) - )); - - b.append(String.format(" missing from daughter: %d (%3.2f%% inconsistent variants)%n", - missing_variant_in_kid, Utils.percentage(missing_variant_in_kid, trio.inconsistent_variant) - )); - - b.append(String.format(" missing from both parents: %d (%3.2f%% inconsistent variants)%n", - missing_variant_in_parents, Utils.percentage(missing_variant_in_parents, trio.inconsistent_variant) - )); - - b.append(String.format(" non-matching in daughter: %d (%3.2f%% inconsistent variants)%n", - nonmatching_variant_in_kid, Utils.percentage(nonmatching_variant_in_kid, trio.inconsistent_variant) - )); - - b.append("per trio individual:\n"); - b.append(" mother:\n"); - b.append(mom.toString()); - b.append(" father:\n"); - b.append(dad.toString()); - b.append(" daughter:\n"); - b.append(kid.toString()); 
- - return b.toString(); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/mendelian/TrioGenotyperWalker.java b/archive/java/src/org/broadinstitute/sting/mendelian/TrioGenotyperWalker.java deleted file mode 100755 index 8579c9731..000000000 --- a/archive/java/src/org/broadinstitute/sting/mendelian/TrioGenotyperWalker.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import org.broad.tribble.util.variantcontext.Allele; -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.MutableVariantContext; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.StandardVCFWriter; -import org.broad.tribble.vcf.VCFConstants; -import org.broad.tribble.vcf.VCFWriter; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.gatk.walkers.varianteval.MendelianViolationEvaluator; - -import java.util.*; -import java.io.File; - -/** - * Implements an (STILL BEING TESTED) algorithm for calling SNPs in trios - */ - -@By(DataSource.REFERENCE) -//@Requires(value={DataSource.REFERENCE, DataSource.REFERENCE_BASES, DataSource.READS},referenceMetaData={@RMD(name="sites",type= VariationRod.class)}) -@Allows({DataSource.READS, DataSource.REFERENCE}) -//, @RMD(name="parent1",type= VariationRod.class), @RMD(name="parent2",type= VariationRod.class)}) -public class TrioGenotyperWalker extends RefWalker{ - @Argument(shortName="mom", doc="", required=true) - protected String mom; - - @Argument(shortName="dad", doc="", 
required=true) - protected String dad; - - @Argument(shortName="kid", doc="", required=true) - protected String kid; - - @Argument(shortName="log10PriorOfDeNovoOfTrueVariant", doc="", required=false) - double LOG10_MENDEL_VIOLATION_PRIOR = -5; // 30 in 3B bases - - @Argument(shortName = "varout", doc = "File to which variants should be written", required = true) - public String vcfOutputFile = null; - - @ArgumentCollection - private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - - UnifiedGenotyperEngine UGEngine = null; - private List FAMILY_MEMBERS; - private VCFWriter writer = null; - - public void initialize() { - UGEngine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, null); - // initialize the header - FAMILY_MEMBERS = Arrays.asList(mom, dad, kid); - - // initialize the writer - writer = new StandardVCFWriter(new File(vcfOutputFile)); - } - - public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - VariantContext vc = tracker.getVariantContext(ref, "variants", EnumSet.of(VariantContext.Type.SNP), context.getLocation(), true); - - if ( vc != null && vc.isPolymorphic() ) { - if ( ! vc.hasGenotypes(FAMILY_MEMBERS) ) - throw new StingException("variants file does not contain genotypes for everyone in family: " + FAMILY_MEMBERS); - - VariantCallContext call = UGEngine.runGenotyper(tracker, ref, context); - - // is call ever be null? 
- vc = annotateTrioPrior(vc, call.vc); - - return vc; - } else { - return null; - } - } - - private VariantContext annotateTrioPrior(VariantContext vcIn, VariantContext call) { - Genotype momG = call.getGenotype(mom); - Genotype dadG = call.getGenotype(dad); - Genotype kidG = call.getGenotype(kid); - - double log10POfGenotype = Double.MIN_VALUE; - if ( MendelianViolationEvaluator.isViolation(call, momG, dadG, kidG) ) { - Allele R = call.getReference(); - Allele A = call.getAlternateAllele(0); - - List> possibleGenotypes = Arrays.asList(Arrays.asList(R,R), Arrays.asList(R,A), Arrays.asList(A,A)); - - double[] L = new double[3 * 3 * 3]; - int i = 0, bestIndex = 0; - double log10LOfBestGenotypes = Integer.MIN_VALUE; - for ( List momPG : possibleGenotypes ) { - for ( List dadPG : possibleGenotypes ) { - for ( List kidPG : possibleGenotypes ) { - double log10LOfG = genotypeL(momPG, momG) + genotypeL(dadPG, dadG) + genotypeL(kidPG, kidG); - boolean isViolation = MendelianViolationEvaluator.isViolation(call, momPG, dadPG, kidPG); - double log10prior = isViolation ? 
LOG10_MENDEL_VIOLATION_PRIOR : 0; - L[i] = log10LOfG + log10prior; - - if ( log10LOfG > log10LOfBestGenotypes ) { - bestIndex = i; - log10LOfBestGenotypes = log10LOfG; - } - logger.debug(String.format("%10s %10s => %10s : %b\t%.2f\t\t%.2f\t\t%3.2f", momPG, dadPG, kidPG, isViolation, log10LOfG, log10prior, L[i])); - i++; - } - } - } - - double[] posteriors = MathUtils.normalizeFromLog10(L, true); - log10POfGenotype = posteriors[bestIndex]; - } - //log10POfViolation = Math.min(log10POfViolation, 0); - - double Q = QualityUtils.phredScaleCorrectRate(Math.pow(10, log10POfGenotype)); - logger.debug(String.format("log10 P of best genotype log10 post = %.2f, Q = %.2f", log10POfGenotype, Q)); - MutableVariantContext mvc = new MutableVariantContext(vcIn); - mvc.putAttribute("MVQ", Q); - return new VariantContext(mvc); - } - - /** - * Isolate the rest of the walker from the code to get genotype likelihood values for allele A/B in genotypeCall - * @param alleles - * @param genotypeCall - * @return - */ - private double genotypeL( List alleles, Genotype genotypeCall ) { - String postTriplet = (String)genotypeCall.getAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); - if ( postTriplet == null ) - throw new StingException("BUG: TrioGenotyperWalker expected genotype likelihood triplets " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY); - - // calculate the offset -- AA => 0, AB => 1, BB => 2 - int i = 0; - for ( Allele a : alleles ) - i += a.isNonReference() ? 
1 : 0; - - // convert the corresponding GL field to a double - String log10LStrings[] = postTriplet.split(","); - return Double.valueOf(log10LStrings[i]); - } - - public Integer reduceInit() { return 0; } - public Integer reduce(VariantContext vc, Integer a) { - if ( vc != null ) { - if ( a == 0 ) - writer.writeHeader(VariantContextAdaptors.createVCFHeader(null, vc)); - - writer.add(vc, (byte)'.'); - a++; - } - - return a; - } - - public void onTraversalDone(Integer result) {} // Don't print the reduce result -} \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/mendelian/VCFToBeagleWalker.java b/archive/java/src/org/broadinstitute/sting/mendelian/VCFToBeagleWalker.java deleted file mode 100755 index acf5c7ec0..000000000 --- a/archive/java/src/org/broadinstitute/sting/mendelian/VCFToBeagleWalker.java +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; -import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.gatk.walkers.varianteval.MendelianViolationEvaluator; - -import java.util.EnumSet; -import java.util.Arrays; - -/** - * Test routine for new VariantContext object - */ -@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="variants",type=ReferenceOrderedDatum.class)}) -public class VCFToBeagleWalker extends RodWalker { - @Argument(shortName="trio", doc="If provide, treats the input VCF as a single record containing genotypes for a single trio; String formatted as dad+mom=child", required=false) - protected String TRIO_STRUCTURE; - - private MendelianViolationEvaluator.TrioStructure trio = null; - - public class Result { - int nVariants, nConverted; - } - - public void initialize() { - if ( TRIO_STRUCTURE != null ) { - trio = MendelianViolationEvaluator.parseTrioDescription(TRIO_STRUCTURE); - out.printf("I id %s%n", Utils.join(" ", Arrays.asList(trio.mom, trio.mom, 
trio.dad, trio.dad, trio.child, trio.child))); - } - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ref != null ) { - EnumSet allowedTypes = EnumSet.of(VariantContext.Type.SNP); - - VariantContext vc = tracker.getVariantContext(ref, "variants", allowedTypes, context.getLocation(), false); - - if ( vc != null && vc.isBiallelic() && vc.isNotFiltered() ) { - if ( trio != null ) { // we are emitting a trio file - if ( ! vc.hasGenotypes() || vc.getGenotypes().size() != 3 ) - throw new StingException("Convertion exception: Trio conversion requires exactly three genotypes at every locus: " + vc); - - if ( genotypesAreGood(vc) ) { - if ( ! genotypesAreGoodForTrios(vc, trio) ) { - logger.debug("VC excluded due to poor trio genotyping " + vc); - } else { - Genotype mom = vc.getGenotype(trio.mom); - Genotype dad = vc.getGenotype(trio.dad); - Genotype child = vc.getGenotype(trio.child); - - // beagle format looks like: - // - // I id 1001 1001 1002 1002 1003 1003 - // A diabetes 1 1 2 2 2 2 - // M rs2289311 A G G G A G - String loc = "c" + vc.getLocation().getContig() + "_p" + vc.getLocation().getStart(); - out.printf("M %s %s %s %s%n", loc, genotype2BeagleString(mom), genotype2BeagleString(dad), genotype2BeagleString(child)); - return 1; - } - } - } else { - throw new IllegalArgumentException("VCFToBeagle currently only supports conversion of trios. 
Complain to mark"); - } - } - } - - return 0; - } - - private String genotype2BeagleString(Genotype g) { - return allele2BeagleString(g.getAllele(0)) + " " + allele2BeagleString(g.getAllele(1)); - } - - private String allele2BeagleString(Allele a) { - return new String(a.getBases()); - } - - private static boolean genotypesAreGood(VariantContext vc) { - for ( Genotype g : vc.getGenotypes().values() ) { - if ( g.isFiltered() ) - return false; - } - - return true; - } - - private static boolean genotypesAreGoodForTrios(VariantContext vc, MendelianViolationEvaluator.TrioStructure trio) { - return ! MendelianViolationEvaluator.isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child)); - } - - public Result reduceInit() { - return new Result(); - } - - public Result reduce(Integer point, Result sum) { - sum.nVariants++; - sum.nConverted += point; - return sum; - } - - public void onTraversalDone(Result result) { - logger.info(String.format("Saw %d raw SNPs", result.nVariants)); - logger.info(String.format("Converted %d (%.2f%%) of these sites", result.nConverted, (100.0 * result.nConverted) / result.nVariants)); - } -} \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/BasicPileup.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/BasicPileup.java deleted file mode 100755 index 88a4ef3aa..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/BasicPileup.java +++ /dev/null @@ -1,186 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import org.broadinstitute.sting.utils.*; - -import net.sf.samtools.*; - -import java.util.List; -import java.util.ArrayList; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: Apr 14, 2009 - * Time: 8:54:05 AM - * To change this template use File | Settings | File Templates. 
- */ -abstract public class BasicPileup { - public static final char DELETION_CHAR = 'D'; - - @Deprecated - abstract GenomeLoc getLocation(); - @Deprecated - abstract char getRef(); - @Deprecated - abstract int size(); - - - /** - * This is the right way to get bases - * - * @return - */ - @Deprecated - byte[] getBases() { return null; } - - /** - * This is the right way to get quals - * - * @return - */ - @Deprecated - byte[] getQuals() { return null; } - - /** - * This is a terrible way to get bases. Use getBases() or getBasesAsArrayList() - * - * @return - */ - @Deprecated - String getBasesAsString() { return null; } - - /** - * This is a terrible way to get quals. Use getQuals() or getQualsAsArrayList() - * - * @return - */ - @Deprecated - String getQualsAsString() { return null; } - - public String getPileupString() { - return String.format("%s: %s %s %s", getLocation(), getRef(), getBasesAsString(), getQualsAsString()); - } - - // - // ArrayList methods - // - public static byte[] getBasesAsArray( List reads, List offsets ) { - byte array[] = new byte[reads.size()]; - int index = 0; - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - if ( offset == -1 ) { - array[index++] = ((byte)DELETION_CHAR); - } else { - array[index++] = read.getReadBases()[offset]; - } - } - return array; - } - - @Deprecated - public static ArrayList getBasesAsArrayList( List reads, List offsets ) { - ArrayList bases = new ArrayList(reads.size()); - for (byte value : getBasesAsArray(reads, offsets)) - bases.add(value); - return bases; - } - - @Deprecated - public static ArrayList getQualsAsArrayList( List reads, List offsets ) { - ArrayList quals = new ArrayList(reads.size()); - for (byte value : getQualsAsArray(reads, offsets)) - quals.add(value); - return quals; - } - - @Deprecated - public static byte[] getQualsAsArray( List reads, List offsets ) { - byte array[] = new byte[reads.size()]; - int index = 0; - for ( int i = 0; i 
< reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - // skip deletion sites - if ( offset == -1 ) { - array[index++] = ((byte)0); - } else { - array[index++] = read.getBaseQualities()[offset]; - } - } - return array; - } - - @Deprecated - public static ArrayList mappingQualPileup( List reads) { - ArrayList quals = new ArrayList(reads.size()); - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - byte qual = (byte)read.getMappingQuality(); - quals.add(qual); - } - return quals; - } - - @Deprecated // todo -- delete me - public static String[] indelPileup( List reads, List offsets ) - { - String[] indels = new String[reads.size()]; - - for (int i = 0; i < reads.size(); i++) - { - SAMRecord read = reads.get(i); - Cigar cigar = read.getCigar(); - int offset = offsets.get(i); - - String cigar_string = read.getCigarString(); - if (! (cigar_string.contains("I") || cigar_string.contains("D"))) { indels[i] = "null"; continue; } - - //System.out.printf("%s:%d %s %s %s ", read.getReferenceName(), read.getAlignmentStart(), read.getReadName(), read.getReadString(), cigar_string); - int k = 0; - for (int j = 0; j < cigar.numCigarElements(); j++) - { - CigarOperator operator = cigar.getCigarElement(j).getOperator(); - int length = cigar.getCigarElement(j).getLength(); - if (operator == CigarOperator.M) - { - k += length; - } - else if ((k == offset+1) && (operator == CigarOperator.I)) - { - // this insertion is associated with this offset (kinda ;) ). - indels[i] = read.getReadString().substring(k, k+length); - //System.out.printf("(I,%d,%d)", k, offset); - break; - } - else if ((k != offset+1) && (operator == CigarOperator.I)) - { - //System.out.printf("(i,%d,%d)", k, offset); - k += length; - } - else if ((k == offset) && (operator == CigarOperator.D)) - { - // this deletion is associated with this offset. 
- indels[i] = length + "D"; - //System.out.printf("(D,%d,%d)", k, offset); - break; - } - else if (k >= offset) - { - // no indel here. - indels[i] = "null"; - //System.out.printf("(N,%d,%d)", k, offset); - break; - } - } - if (indels[i] == null) { indels[i] = "null"; } - //System.out.printf("\n"); - } - - return indels; - } -} - - diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/ClassicGenotypeLikelihoods.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/ClassicGenotypeLikelihoods.java deleted file mode 100644 index 8209fc949..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/ClassicGenotypeLikelihoods.java +++ /dev/null @@ -1,508 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import org.broadinstitute.sting.utils.MathUtils; - -import static java.lang.Math.log10; -import static java.lang.Math.pow; -import java.lang.Cloneable; - -public class ClassicGenotypeLikelihoods implements Cloneable { - // precalculate these for performance (pow/log10 is expensive!) - private static final double[] oneMinusData = new double[Byte.MAX_VALUE]; - - static { - for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { - oneMinusData[qual] = log10(1.0 - pow(10, (qual / -10.0))); - //oneMinusData[qual] = log10(1.0 - QualityUtils.qualToProb(qual)); - } - } - - private static double getOneMinusQual(final byte qual) { - return oneMinusData[qual]; - } - - private static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE]; - - static { - for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { - oneHalfMinusData[qual] = log10(0.5 - pow(10, (qual / -10.0)) / 2.0); - //oneHalfMinusData[qual] = log10(0.5 - QualityUtils.qualToProb(qual) / 2.0); - } - } - - private static double getOneHalfMinusQual(final byte qual) { - return oneHalfMinusData[qual]; - } - - public double[] likelihoods; - public String[] genotypes; - public int coverage; - - // The genotype priors; - private double priorHomRef; - private double priorHet; - private double priorHomVar; - public String[] sorted_genotypes; - public double[] sorted_likelihoods; - double ref_likelihood = Double.NaN; - private IndelLikelihood indel_likelihood; - - // Store the 2nd-best base priors for on-genotype primary bases - //private HashMap onNextBestBasePriors = new HashMap(); - - // Store the 2nd-best base priors for off-genotype primary bases - //private HashMap offNextBestBasePriors = new HashMap(); - - private static double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; - private static double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; - - public 
ClassicGenotypeLikelihoods() { - initialize(1.0 - 1e-3, 1e-3, 1e-5, p2ndon, p2ndoff); - } - - public ClassicGenotypeLikelihoods(boolean foo) { - } - - public ClassicGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { - initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); - } - - public ClassicGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { - initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); - } - - public ClassicGenotypeLikelihoods clone() { - ClassicGenotypeLikelihoods c = new ClassicGenotypeLikelihoods(false); - c.likelihoods = this.likelihoods.clone(); - c.genotypes = this.genotypes.clone(); - c.coverage = this.coverage; - - // The genotype priors; - c.priorHomRef = this.priorHomRef; - c.priorHet = this.priorHet; - c.priorHomVar = this.priorHomVar; - //public String[] sorted_genotypes; - //public double[] sorted_likelihoods; - //double ref_likelihood = Double.NaN; - //private IndelLikelihood indel_likelihood; - return c; - } - - private void initialize(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { - this.priorHomRef = priorHomRef; - this.priorHet = priorHet; - this.priorHomVar = priorHomVar; - - likelihoods = new double[10]; - genotypes = new String[10]; - coverage = 0; - - for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); } - - genotypes[0] = "AA"; - genotypes[1] = "AC"; - genotypes[2] = "AG"; - genotypes[3] = "AT"; - genotypes[4] = "CC"; - genotypes[5] = "CG"; - genotypes[6] = "CT"; - genotypes[7] = "GG"; - genotypes[8] = "GT"; - genotypes[9] = "TT"; - - //for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { - // onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); - // offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); - //} - } - - public double getHomRefPrior() { - return priorHomRef; - } - - public 
void setHomRefPrior(double priorHomRef) { - this.priorHomRef = priorHomRef; - } - - public double getHetPrior() { - return priorHet; - } - - public void setHetPrior(double priorHet) { - this.priorHet = priorHet; - } - - public double getHomVarPrior() { - return priorHomVar; - } - - public void setHomVarPrior(double priorHomVar) { - this.priorHomVar = priorHomVar; - } - -// public double[] getOnGenotypeSecondaryPriors() { -// double[] p2ndon = new double[10]; -// -// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { -// p2ndon[genotypeIndex] = onNextBestBasePriors.get(genotypes[genotypeIndex]); -// } -// -// return p2ndon; -// } -// -// public void setOnGenotypeSecondaryPriors(double[] p2ndon) { -// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { -// onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); -// } -// } -// -// public double[] getOffGenotypeSecondaryPriors() { -// double[] p2ndoff = new double[10]; -// -// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { -// p2ndoff[genotypeIndex] = offNextBestBasePriors.get(genotypes[genotypeIndex]); -// } -// -// return p2ndoff; -// } -// -// public void setOffGenotypeSecondaryPriors(double[] p2ndoff) { -// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { -// offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); -// } -// } - - public void add(char ref, char read, byte qual) - { - if (qual <= 0) { qual = 1; } - - if (coverage == 0) - { - for (int i = 0; i < likelihoods.length; i++) - { - likelihoods[i] = 0; - } - } - double sum = 0.0; - for (int i = 0; i < genotypes.length; i++) - { - double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual); - likelihoods[i] += likelihood; - } - coverage += 1; - } - - public void add(char ref, char read, byte qual, ConfusionMatrix matrix, String platform) - { - if (qual <= 0) { qual = 1; } - if (platform == null) { platform = "ILLUMINA"; } - if (read 
== 'N') { return; } - - if (coverage == 0) - { - for (int i = 0; i < likelihoods.length; i++) - { - likelihoods[i] = 0; - } - } - double sum = 0.0; - for (int i = 0; i < genotypes.length; i++) - { - char h1 = genotypes[i].charAt(0); - char h2 = genotypes[i].charAt(1); - - double p1 = matrix.lookup(platform, read, h1); - double p2 = matrix.lookup(platform, read, h2); - - double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual, p1, p2); - - //System.out.printf("DBG: %c %c %s %d %f %f %f\n", ref, read, genotypes[i], qual, p1, p2, likelihood); - - likelihoods[i] += likelihood; - } - coverage += 1; - } - - private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) { - if (qual == 0) { - qual = 1; - } // zero quals are wrong. - - char h1 = genotype.charAt(0); - char h2 = genotype.charAt(1); - - double p_base; - - if ((h1 == h2) && (h1 == read)) { - // hom - p_base = getOneMinusQual(qual); - } else if ((h1 != h2) && ((h1 == read) || (h2 == read))) { - // het - p_base = getOneHalfMinusQual(qual); - } else { - // error - p_base = qual / -10.0; - } - - return p_base; - } - - public void TEST() - { - double p_A2A = 1.00; - double p_T2T = 1.00; - - double p_A2T = 0.75; - double p_T2A = 0.25; - - char ref = 'A'; - - System.out.printf("\tA\tT\n"); - System.out.printf("A\t%.02f\t%.02f\n", p_A2A, p_A2T); - System.out.printf("T\t%.02f\t%.02f\n", p_T2A, p_T2T); - System.out.printf("\n"); - - System.out.printf("P(A,Q20|AA) = %f\n", calculateAlleleLikelihood(ref, 'A', "AA", (byte)20, p_A2A, p_A2A)); - System.out.printf("P(A,Q20|AT) = %f\n", calculateAlleleLikelihood(ref, 'A', "AT", (byte)20, p_A2A, p_A2T)); - System.out.printf("P(A,Q20|TT) = %f\n", calculateAlleleLikelihood(ref, 'A', "TT", (byte)20, p_A2T, p_A2T)); - - System.out.printf("P(T,Q20|AA) = %f\n", calculateAlleleLikelihood(ref, 'T', "AA", (byte)20, p_T2A, p_T2A)); - System.out.printf("P(T,Q20|AT) = %f\n", calculateAlleleLikelihood(ref, 'T', "AT", (byte)20, p_T2A, p_T2T)); - 
System.out.printf("P(T,Q20|TT) = %f\n", calculateAlleleLikelihood(ref, 'T', "TT", (byte)20, p_T2T, p_T2T)); - - //System.exit(0); - } - - private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual, double p1, double p2) { - if (qual == 0) { - qual = 1; - } // zero quals are wrong. - - char h1 = genotype.charAt(0); - char h2 = genotype.charAt(1); - - double perr = Math.pow(10.0,qual/-10.0); - - double p_base = 0; - - if (read == h1) - { - p_base += (1.0 - perr); - } - else - { - p_base += perr * p1; - } - - if (read == h2) - { - p_base += (1.0 - perr); - } - else - { - p_base += perr * p2; - } - - p_base = Math.log10(p_base/2.0); - - return p_base; - } - - public void sort() { - Integer[] permutation = MathUtils.sortPermutation(likelihoods); - - Integer[] reverse_permutation = new Integer[permutation.length]; - for (int i = 0; i < reverse_permutation.length; i++) { - reverse_permutation[i] = permutation[(permutation.length - 1) - i]; - } - - sorted_genotypes = MathUtils.permuteArray(genotypes, reverse_permutation); - sorted_likelihoods = MathUtils.permuteArray(likelihoods, reverse_permutation); - } - - public String toString(char ref) { - this.sort(); - double sum = 0; - String s = String.format("%s %f %f ", this.BestGenotype(), this.LodVsNextBest(), this.LodVsRef(ref)); - for (int i = 0; i < sorted_genotypes.length; i++) { - if (i != 0) { - s = s + " "; - } - s = s + sorted_genotypes[i] + ":" + String.format("%.2f", sorted_likelihoods[i]); - sum += Math.pow(10,sorted_likelihoods[i]); - } - s = s + String.format(" %f", sum); - return s; - } - - public void ApplyPrior(char ref, double[] allele_likelihoods) - { - int k = 0; - for (int i = 0; i < 4; i++) - { - for (int j = i; j < 4; j++) - { - if (i == j) - { - this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]); - } - else - { - this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]) + Math.log10(2); - } - k++; 
- } - } - this.sort(); - } - - public void ApplyPrior(char ref, char alt, double p_alt) { - for (int i = 0; i < genotypes.length; i++) { - if ((p_alt == -1) || (p_alt <= 1e-6)) { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { - // hom-ref - likelihoods[i] += Math.log10(priorHomRef); - } else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) { - // hom-nonref - likelihoods[i] += Math.log10(priorHomVar); - } else { - // het - likelihoods[i] += Math.log10(priorHet); - } - if (Double.isInfinite(likelihoods[i])) { - likelihoods[i] = -1000; - } - } else { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { - // hom-ref - likelihoods[i] += 2.0 * Math.log10(1.0 - p_alt); - } else if ((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == alt)) { - // hom-nonref - likelihoods[i] += 2.0 * Math.log10(p_alt); - } else if (((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == ref)) || - ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == alt))) { - // het - likelihoods[i] += Math.log10((1.0 - p_alt) * p_alt * 2.0); - } else { - // something else (noise!) 
- likelihoods[i] += Math.log10(1e-5); - } - - if (Double.isInfinite(likelihoods[i])) { - likelihoods[i] = -1000; - } - } - } - this.sort(); - } - - public void ApplyWeight(double weight) - { - double log10weight = Math.log10(weight); - for (int i = 0; i < genotypes.length; i++) { likelihoods[i] += log10weight; } - this.sort(); - } - -// public void applySecondBaseDistributionPrior(String primaryBases, String secondaryBases) { -// for (int genotypeIndex = 0; genotypeIndex < genotypes.length; genotypeIndex++) { -// char firstAllele = genotypes[genotypeIndex].charAt(0); -// char secondAllele = genotypes[genotypeIndex].charAt(1); -// -// int offIsGenotypic = 0; -// int offTotal = 0; -// -// int onIsGenotypic = 0; -// int onTotal = 0; -// -// for (int pileupIndex = 0; pileupIndex < primaryBases.length(); pileupIndex++) { -// char primaryBase = primaryBases.charAt(pileupIndex); -// -// if (secondaryBases != null) { -// char secondaryBase = secondaryBases.charAt(pileupIndex); -// -// if (primaryBase != firstAllele && primaryBase != secondAllele) { -// if (secondaryBase == firstAllele || secondaryBase == secondAllele) { -// offIsGenotypic++; -// } -// offTotal++; -// } else { -// if (secondaryBase == firstAllele || secondaryBase == secondAllele) { -// onIsGenotypic++; -// } -// onTotal++; -// } -// } -// } -// -// double offPrior = MathUtils.binomialProbability(offIsGenotypic, offTotal, offNextBestBasePriors.get(genotypes[genotypeIndex])); -// double onPrior = MathUtils.binomialProbability(onIsGenotypic, onTotal, onNextBestBasePriors.get(genotypes[genotypeIndex])); -// -// likelihoods[genotypeIndex] += Math.log10(offPrior) + Math.log10(onPrior); -// } -// this.sort(); -// } - - public double LodVsNextBest() { - this.sort(); - return sorted_likelihoods[0] - sorted_likelihoods[1]; - } - - public double LodVsRef(char ref) { - if ((this.BestGenotype().charAt(0) == ref) && (this.BestGenotype().charAt(1) == ref)) { - ref_likelihood = sorted_likelihoods[0]; - return (-1.0 * 
this.LodVsNextBest()); - } else { - for (int i = 0; i < genotypes.length; i++) { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { - ref_likelihood = likelihoods[i]; - } - } - } - return sorted_likelihoods[0] - ref_likelihood; - } - - public String BestGenotype() { - this.sort(); - return this.sorted_genotypes[0]; - } - - public double BestPosterior() { - this.sort(); - return this.sorted_likelihoods[0]; - } - - public double RefPosterior(char ref) - { - this.LodVsRef(ref); - return this.ref_likelihood; - } - - public void addIndelLikelihood(IndelLikelihood indel_likelihood) { this.indel_likelihood = indel_likelihood; } - public IndelLikelihood getIndelLikelihood() { return this.indel_likelihood; } - -} diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/ConfusionMatrix.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/ConfusionMatrix.java deleted file mode 100644 index 8b51648e5..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/ConfusionMatrix.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import java.lang.*; -import java.util.*; -import java.io.*; - -import org.broadinstitute.sting.utils.*; - -public class ConfusionMatrix -{ - private double[][] ILLUMINA; - private double[][] solid; - private double[][] LS454; - - public ConfusionMatrix(String file_name) - { - //System.out.println("DBG: ConfusionMatrix constructor! 
(" + file_name + ")"); - - ILLUMINA = new double[4][4]; - solid = new double[4][4]; - LS454 = new double[4][4]; - - try - { - Scanner sc = new Scanner(new File(file_name)); - while (sc.hasNext()) - { - String platform = sc.next(); - char read = sc.next().charAt(0); - char ref = sc.next().charAt(0); - double p = sc.nextDouble(); - - int read_i = BaseUtils.simpleBaseToBaseIndex(read); - int ref_i = BaseUtils.simpleBaseToBaseIndex(ref); - - if (platform.equals("ILLUMINA")) { ILLUMINA[read_i][ref_i] = p; } - if (platform.equals("solid")) { solid[read_i][ref_i] = p; } - if (platform.equals("LS454")) { LS454[read_i][ref_i] = p; } - - //System.out.println("DBG: " + key + " " + p); - } - } - catch (Exception e) - { - e.printStackTrace(); - System.exit(-1); - } - - } - - double lookup(String platform, char read, char truth) - { - int read_i = BaseUtils.simpleBaseToBaseIndex(read); - int truth_i = BaseUtils.simpleBaseToBaseIndex(truth); - - double d = 0; - - if (platform.equals("ILLUMINA")) { d = ILLUMINA[read_i][truth_i]; } - else if (platform.equals("solid")) { d = solid[read_i][truth_i]; } - else if (platform.equals("LS454")) { d = LS454[read_i][truth_i]; } - else { assert(false); } - - return d; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/EMGenotypeCalculationModel.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/EMGenotypeCalculationModel.java deleted file mode 100755 index a374a7d45..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/EMGenotypeCalculationModel.java +++ /dev/null @@ -1,191 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.gatk.contexts.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.genotype.*; - -import 
java.util.*; - -public abstract class EMGenotypeCalculationModel extends GenotypeCalculationModel { - - // We need to set a limit on the EM iterations in case something flukey goes on - protected static final int MAX_EM_ITERATIONS = 8; - - // We consider the EM stable when the MAF doesn't change more than 1/10,000 - protected static final double EM_STABILITY_METRIC = 1e-4; - - protected EMGenotypeCalculationModel() {} - - public VariantCallContext callLocus(RefMetaDataTracker tracker, char ref, GenomeLoc loc, Map contexts, DiploidGenotypePriors priors) { - - // run the EM calculation - EMOutput overall = runEM(ref, contexts, priors, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - - double PofD = Math.pow(10, overall.getPofD()); - double PofNull = Math.pow(10, overall.getPofNull()); - double sum = PofD + PofNull; - - // calculate the phred-scaled confidence score - double phredScaledConfidence; - boolean bestIsRef = false; - if ( PofD > PofNull ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofNull / sum); - } else { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofD / sum); - bestIsRef = true; - } - - // are we above the lod threshold for emitting calls (and not in all-bases mode)? - if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) - return new VariantCallContext(phredScaledConfidence >= CONFIDENCE_THRESHOLD); - - // generate the calls - List calls = genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts); - - VariationCall locusdata = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, loc, bestIsRef ? 
Variation.VARIANT_TYPE.REFERENCE : Variation.VARIANT_TYPE.SNP); - if ( locusdata != null ) { - if ( locusdata instanceof ConfidenceBacked ) { - ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence); - } - if ( locusdata instanceof IDBacked ) { - rodDbSNP dbsnp = getDbSNP(tracker); - if ( dbsnp != null ) - ((IDBacked)locusdata).setID(dbsnp.getRS_ID()); - } - if ( locusdata instanceof SLODBacked && REPORT_SLOD ) { - - // calculate strand score - - double lod = overall.getPofD() - overall.getPofNull(); - logger.debug(String.format("LOD=%f", lod)); - - EMOutput forward = runEM(ref, contexts, priors, StratifiedAlignmentContext.StratifiedContextType.FORWARD); - EMOutput reverse = runEM(ref, contexts, priors, StratifiedAlignmentContext.StratifiedContextType.REVERSE); - double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull(); - double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull(); - logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); - double strandScore = Math.max(forwardLod - lod, reverseLod - lod); - logger.debug(String.format("SLOD=%f", strandScore)); - // rescale by a factor of 10 - strandScore *= 10.0; - - ((SLODBacked)locusdata).setSLOD(strandScore); - } - locusdata.setNonRefAlleleFrequency(overall.getMAF()); - - // finally, associate the Variation with the Genotypes - locusdata.setGenotypeCalls(calls); - for ( Genotype call : calls ) - ((GenotypeCall)call).setVariation(locusdata); - } - return new VariantCallContext(locusdata, calls, phredScaledConfidence >= CONFIDENCE_THRESHOLD); - } - - protected List genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, Map contexts) { - HashMap GLs = results.getGenotypeLikelihoods(); - - ArrayList calls = new ArrayList(); - int variantCalls = 0; - - for ( String sample : GLs.keySet() ) { - - // create the call - AlignmentContext context = 
contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - GenotypeCall call = GenotypeWriterFactory.createSupportedGenotypeCall(OUTPUT_FORMAT, ref, context.getLocation()); - - // set the genotype and confidence - double[] posteriors = GLs.get(sample).getPosteriors(); - Integer sorted[] = Utils.SortPermutation(posteriors); - DiploidGenotype bestGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 1]]; - DiploidGenotype nextGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 2]]; - call.setNegLog10PError(posteriors[bestGenotype.ordinal()] - posteriors[nextGenotype.ordinal()]); - call.setGenotype(bestGenotype); - - if ( call instanceof ReadBacked ) { - ReadBackedPileup pileup = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(); - ((ReadBacked)call).setPileup(pileup); - } - if ( call instanceof SampleBacked ) { - ((SampleBacked)call).setSampleName(sample); - } - if ( call instanceof LikelihoodsBacked ) { - ((LikelihoodsBacked)call).setLikelihoods(GLs.get(sample).getLikelihoods()); - } - if ( call instanceof PosteriorsBacked ) { - ((PosteriorsBacked)call).setPosteriors(posteriors); - } - - calls.add(call); - - // increment the variant count if it's non-ref - if ( call.isVariant(ref) ) - variantCalls++; - } - - // if everyone is ref, don't emit any calls - if ( variantCalls == 0 ) - calls.clear(); - - return calls; - } - - public EMOutput runEM(char ref, Map contexts, DiploidGenotypePriors priors, StratifiedAlignmentContext.StratifiedContextType contextType) { - - // initialize the allele frequencies - initializeAlleleFrequencies(contexts.size(), ref); - - // initialize the genotype likelihoods - initializeGenotypeLikelihoods(ref, contexts, priors, contextType); - - // The EM loop: - // we want to continue until the calculation is stable, but we need some max on the number of iterations - int iterations = 0; - boolean EM_IS_STABLE; 
- - do { - calculateAlleleFrequencyPosteriors(); - - applyAlleleFrequencyToGenotypeLikelihoods(); - - EM_IS_STABLE = isStable(); - - } while ( ++iterations < MAX_EM_ITERATIONS && !EM_IS_STABLE ); - - logger.debug("EM loop took " + iterations + " iterations"); - - return computePofF(ref); - } - - protected abstract void initializeAlleleFrequencies(int numSamplesInContext, char ref); - protected abstract void initializeGenotypeLikelihoods(char ref, Map contexts, DiploidGenotypePriors priors, StratifiedAlignmentContext.StratifiedContextType contextType); - protected abstract void calculateAlleleFrequencyPosteriors(); - protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(); - protected abstract boolean isStable(); - protected abstract EMOutput computePofF(char ref); - - - /** - * A class to keep track of the EM output - */ - protected class EMOutput { - private double pD, pNull, pF, MAF; - private HashMap GLs; - - EMOutput(double pD, double pNull, double pF, double MAF, HashMap GLs) { - this.pD = pD; - this.pNull = pNull; - this.pF = pF; - this.MAF = MAF; - this.GLs = GLs; - } - - public double getPofD() { return pD; } - public double getPofNull() { return pNull; } - public double getPofF() { return pF; } - public double getMAF() { return MAF; } - public HashMap getGenotypeLikelihoods() { return GLs; } - } -} \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/IndelLikelihood.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/IndelLikelihood.java deleted file mode 100755 index 71ebd9498..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/IndelLikelihood.java +++ /dev/null @@ -1,113 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import java.util.HashMap; - -public class IndelLikelihood { - private String type; - private String[] alleles; - private double p; - private double lod; - - private double pRef; - private double pHet; - private 
double pHom; - private String alt; - - public IndelLikelihood(String type, String[] alleles, double p, double lod) { - initialize(type, alleles, p, lod); - } - - public IndelLikelihood(String[] indels, double indel_alt_freq) - { - HashMap indel_allele_counts = new HashMap(); - - for (int i = 0; i < indels.length; i++) { - if (! indel_allele_counts.containsKey(indels[i])) { - indel_allele_counts.put(indels[i], 1); - } else { - indel_allele_counts.put(indels[i], indel_allele_counts.get(indels[i])+1); - } - } - - Object[] keys = indel_allele_counts.keySet().toArray(); - String[] alleles = new String[keys.length]; - int[] counts = new int[keys.length]; - //double likelihoods[] = new double[keys.length]; - int null_count = 0; - String max_allele = null; - int max_count = -1; - - if ((keys.length > 0) && (! ((keys.length == 1) && (((String)keys[0]).equals("null"))))) { - for (int i = 0; i < keys.length; i++) { - Integer count = (Integer)indel_allele_counts.get(keys[i]); - alleles[i] = (String)keys[i]; - counts[i] = count; - if (alleles[i].equals("null")) { null_count = counts[i]; } - else if (counts[i] > max_count) { max_count = counts[i]; max_allele = alleles[i]; } - //System.out.printf("%s[%d] ", keys[i], count); - } - //System.out.printf("\n"); - - double eps = 1e-3; - pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + 2*Math.log10(1-indel_alt_freq); - pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10((1-indel_alt_freq)*indel_alt_freq); - pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + 2*Math.log10(indel_alt_freq); - - double lodRef = pRef - Math.max(pHet, pHom); - double lodHet = pHet - pRef; - double lodHom = pHom - pRef; - - //System.out.printf("%s/%s %f %f\n", "null", "null", pRef, lodRef); - //System.out.printf("%s/%s %f %f\n", max_allele, "null", pHet, lodHet); - //System.out.printf("%s/%s %f %f\n", max_allele, max_allele, pHom, lodHom); - //System.out.printf("\n"); - - if (lodRef > 
0) { - // reference call - String[] genotype = new String[2]; - genotype[0] = "null"; - genotype[1] = "null"; - - //return new IndelLikelihood("ref", genotype, pRef, lodRef); - initialize("ref", genotype, pRef, lodRef); - } else if (lodHet > lodHom) { - // het call - String[] genotype = new String[2]; - genotype[0] = "null"; - genotype[1] = max_allele; - - //return new IndelLikelihood("het", genotype, pHet, lodHet); - initialize("het", genotype, pHet, lodHet); - } else { - // hom call - String[] genotype = new String[2]; - genotype[0] = max_allele; - genotype[1] = max_allele; - - //return new IndelLikelihood("hom", genotype, pHom, lodHom); - initialize("hom", genotype, pHom, lodHom); - } - } - } - - private void initialize(String type, String[] alleles, double p, double lod) { - this.type = type; - this.alleles = alleles; - this.p = p; - this.lod = lod; - } - - public String getAlt() { return alt; } - public double pRef() { return pRef; } - public double pHet() { return pHet; } - public double pHom() { return pHom; } - - public String getType() { return type; } - public String[] getAlleles() { return alleles; } - public double getPosteriorProbability() { return p; } - public double getLOD() { return lod; } - - public String toString() { - return String.format("%s/%s %f %f", alleles[0], alleles[1], p, lod); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/ListSampleIds.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/ListSampleIds.java deleted file mode 100644 index 290175193..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/ListSampleIds.java +++ /dev/null @@ -1,62 +0,0 @@ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; - -import java.util.List; - -public class ListSampleIds extends LocusWalker -{ - public void initialize() - { - GenomeAnalysisEngine toolkit = this.getToolkit(); - SAMFileHeader header = toolkit.getSAMFileHeader(); - List read_groups = header.getReadGroups(); - - for (int i = 0; i < read_groups.size(); i++) - { - String sample_name = read_groups.get(i).getSample(); - out.println(sample_name); - } - } - - public Boolean map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - List reads = context.getReads(); - StringBuilder readNames = new StringBuilder(); - - for ( int i = 0; i < reads.size(); i++ ) - { - SAMRecord read = reads.get(i); - SAMReadGroupRecord readGroup = read.getReadGroup(); - if (readGroup == null) { System.out.printf("."); return false; } - String sample = readGroup.getSample(); - System.out.printf("FROM_MAP %s\n", sample); - } - - return true; - } - - public void onTraversalDone() - { - return; - } - - public Boolean reduceInit() - { - return true; - } - - public Boolean reduce(Boolean mapresult, Boolean sum) - { - out.flush(); - return true; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/MultiSampleCaller.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/MultiSampleCaller.java deleted file mode 100644 index fdc1a61cb..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/MultiSampleCaller.java +++ /dev/null @@ -1,1093 +0,0 @@ - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, 
distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.*; -import java.util.zip.GZIPOutputStream; - -// Beta iterative multi-sample caller -// j.maguire 6-11-2009 - -public class MultiSampleCaller extends LocusWalker -{ - @Argument(required=false, shortName="fractional_counts", doc="should we use fractional counts?") public boolean FRACTIONAL_COUNTS = false; - @Argument(required=false, shortName="max_iterations", doc="Maximum number of iterations for 
EM") public int MAX_ITERATIONS = 10; - @Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT = null; - @Argument(fullName="individual_output", shortName="individual_output", required=false, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT = null; - @Argument(fullName="stats_output", shortName="stats_output", required=false, doc="file to write stats to") public String STATS_OUTPUT = null; - @Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null; - @Argument(fullName="call_indels", shortName="call_indels", required=false, doc="call indels?") public boolean CALL_INDELS = false; - @Argument(fullName="weight_samples", shortName="weight_samples", required=false, doc="rw-weight samples during EM?") public boolean WEIGHT_SAMPLES = false; - - @Argument(fullName="theta", shortName="theta", required=false, doc="rate of sequence divergence") public double THETA = 1e-3; - @Argument(fullName="allele_frequency_prior", shortName="allele_frequency_prior", required=false, doc="use prior on allele frequencies? (P(f) = theta/(N*f)") public boolean ALLELE_FREQUENCY_PRIOR = false; - - @Argument(fullName="confusion_matrix_file", shortName="confusion_matrix_file", required=false, doc="file containing confusion matrix for all three technologies") public String CONFUSION_MATRIX_FILE = null; - - @Argument(fullName="ALLELE_FREQ_TOLERANCE", shortName="AFT", required=false, doc="") - public double ALLELE_FREQ_TOLERANCE = 1e-6; - - @Argument(fullName="append", shortName="append", required=false, doc="if the discovery file already exists, don't re-call sites that are done.") boolean APPEND = false; - - private static final double MIN_LOD_FOR_STRAND = 0.01; - - // Private state. 
- protected List sample_names; - protected SAMFileHeader header; - protected PrintStream individual_output_file = null; - protected PrintStream discovery_output_file = null; - protected PrintStream stats_output_file = null; - - private boolean INCLUDE_STATS = false; - private boolean INCLUDE_GENOTYPES = false ; - - class MultiSampleCallResult - { - GenomeLoc location; - char ref; - char alt; - EM_Result em_result; - double lod; - double strand_score; - double pD; - double pNull; - String in_dbsnp; - int n_ref; - int n_het; - int n_hom; - int EM_N; - double alt_freq; - - public MultiSampleCallResult(GenomeLoc location, char ref, char alt, EM_Result em_result, double lod, double strand_score, double pD, double pNull, String in_dbsnp, int n_ref, int n_het, int n_hom, int EM_N, double alt_freq) - { - this.location = location; - this.ref = ref; - this.alt = alt; - this.em_result = em_result; - this.lod = lod; - this.strand_score = strand_score; - this.pD = pD; - this.pNull = pNull; - this.in_dbsnp = in_dbsnp; - this.n_ref = n_ref; - this.n_het = n_het; - this.n_hom = n_hom; - this.EM_N = EM_N; - this.alt_freq = alt_freq; - } - - public MultiSampleCallResult() { } // this is just so I can do new MultiSampleCallResult().header(). 
"inner classes cannot have static declarations" :( - - public String header() - { - return new String("loc ref alt lod strand_score pD pNull in_dbsnp pA pC pG pT EM_alt_freq EM_N n_ref n_het n_hom"); - } - - public String toString() - { - String s = ""; - s = s + String.format("%s %c %c %f %f %f %f %s ", location, ref, alt, lod, strand_score, pD, pNull, in_dbsnp); - for (int i = 0; i < 4; i++) { s = s + String.format("%f ", em_result.allele_likelihoods[i]); } - s = s + String.format("%f %d %d %d %d", alt_freq, em_result.EM_N, n_ref, n_het, n_hom); - return s; - } - } - - public static class DepthStats - { - public static String Header() - { - return "loc ref depth A C G T a c g t mq_min mq_mean mq_median mq_max mq_sd"; - } - - public static String Row(char ref, AlignmentContext context) - { - String ans = ""; - List reads = context.getReads(); - List offsets = context.getOffsets(); - //Pileup pileup = new ReadBackedPileupOld(ref, context); - - ans += String.format("%s ", context.getLocation()); - ans += String.format("%c ", ref); - ans += String.format("%d ", reads.size()); - ans += String.format("%d ", countBase(context, 'A', "+")); - ans += String.format("%d ", countBase(context, 'C', "+")); - ans += String.format("%d ", countBase(context, 'G', "+")); - ans += String.format("%d ", countBase(context, 'T', "+")); - ans += String.format("%d ", countBase(context, 'A', "-")); - ans += String.format("%d ", countBase(context, 'C', "-")); - ans += String.format("%d ", countBase(context, 'G', "-")); - ans += String.format("%d ", countBase(context, 'T', "-")); - - ans += String.format("%s ", Stats(BasicPileup.mappingQualPileup(reads))); - - return ans; - } - - static int countBase(AlignmentContext context, char base, String strand) - { - int count = 0; - List reads = context.getReads(); - List offsets = context.getOffsets(); - for (int i = 0; i < reads.size(); i++) - { - if (reads.get(i).getReadString().charAt(offsets.get(i)) == base) - { - if (strand.equals("+") && 
(reads.get(i).getReadNegativeStrandFlag()==false)) { count += 1; } - else if (strand.equals("-") && (reads.get(i).getReadNegativeStrandFlag()==true)) { count += 1; } - else if (! (strand.equals("+") || strand.equals("-"))) { count += 1; } - } - } - return count; - } - - public static String Stats(ArrayList X) - { - Collections.sort(X); - - long count = 0; - long sum = 0; - long min = X.get(0); - long max = X.get(0); - long median = X.get(0); - for (int i = 0; i < X.size(); i++) - { - int x = X.get(i); - if (x < min) { min = x; } - if (x > max) { max = x; } - sum += x; - count += 1; - if (i == X.size()/2) { median = x; } - } - - double mean = sum/count; - for (int i = 0; i < X.size(); i++) - { - int x = X.get(i); - sum += (x-mean)*(x-mean); - count += 1; - } - double sd = Math.sqrt(sum/count); - - return String.format("%d %f %d %d %f", min, mean, median, max, sd); - } - } - - GenomeLoc highest_previously_done_loc = null; - private boolean in_skip_mask(GenomeLoc loc) - { - if (highest_previously_done_loc == null) { return false; } - if (highest_previously_done_loc.compareTo(loc) < 0) { return false; } - else { return true; } - } - - private void maybeInitializeDisoveryOutput() - { - if ( discovery_output_file == null ) - { - File file = new File(DISCOVERY_OUTPUT); - if ((APPEND == true) && (file.exists())) - { - try - { - Runtime.getRuntime().exec("cp -v " + DISCOVERY_OUTPUT + " " + DISCOVERY_OUTPUT + ".backup"); - - // 1. Read the existing file and record the highest site we've seen. - Scanner scanner = new Scanner(file); - while(scanner.hasNext()) - { - String line = scanner.nextLine(); - String[] tokens = line.split(" +"); - String loc_string = tokens[0]; - if (loc_string.equals("loc")) { continue; } - highest_previously_done_loc = GenomeLocParser.parseGenomeLoc(loc_string); - } - - // 2. Open the output file for appending. 
- discovery_output_file = new PrintStream(new FileOutputStream(DISCOVERY_OUTPUT, true)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - } - else - { - try - { - final String filename = DISCOVERY_OUTPUT; - discovery_output_file = new PrintStream(filename); - discovery_output_file.println(new MultiSampleCallResult().header()); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - } - } - } - - ///////// - // Walker Interface Functions - public void initialize() - { - System.out.printf("\n\n\n\n"); - (new ClassicGenotypeLikelihoods()).TEST(); - - try - { - maybeInitializeDisoveryOutput(); - - INCLUDE_GENOTYPES = INDIVIDUAL_OUTPUT != null; - if ( INCLUDE_GENOTYPES ) { - individual_output_file = new PrintStream(new GZIPOutputStream(new FileOutputStream(INDIVIDUAL_OUTPUT))); - individual_output_file.println("loc ref sample_name genotype lodVsNextBest lodVsRef in_dbsnp AA AC AG AT CC CG CT GG GT TT"); - } - - INCLUDE_STATS = STATS_OUTPUT != null; - if ( INCLUDE_STATS ) { - stats_output_file = new PrintStream(STATS_OUTPUT); - stats_output_file.println(DepthStats.Header()); - } - } - catch (Exception e) - { - e.printStackTrace(); - System.exit(-1); - } - - GenomeAnalysisEngine toolkit = this.getToolkit(); - this.header = toolkit.getSAMFileHeader(); - List read_groups = header.getReadGroups(); - - sample_names = new ArrayList(); - - HashSet unique_sample_names = new HashSet(); - - for (int i = 0; i < read_groups.size(); i++) - { - String sample_name = read_groups.get(i).getSample(); - String platform = (String)(read_groups.get(i).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); - - if (SAMPLE_NAME_REGEX != null) { sample_name = sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - - //System.out.printf("SAMPLE: %s %s\n", sample_name, platform); - - if (unique_sample_names.contains(sample_name)) { continue; } - unique_sample_names.add(sample_name); - sample_names.add(sample_name); - - System.out.printf("UNIQUE_SAMPLE: %s %s\n", sample_name, 
platform); - } - - // Load the confusion matrix if it exists - if (CONFUSION_MATRIX_FILE != null) - { - this.confusion_matrix = new ConfusionMatrix(CONFUSION_MATRIX_FILE); - } - - } - - - Date start_time = null; - int n_sites_processed = 0; - - public MultiSampleCallResult map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - if (start_time == null) { start_time = new Date(); } - - if (in_skip_mask(context.getLocation()) == true) { return null; } - - if ((n_sites_processed % 1000) == 0) - { - Date current_time = new Date(); - long elapsed = current_time.getTime() - start_time.getTime(); - out.printf("RUNTIME: %d sites processed in %f seconds; %f seconds per site.\n", - n_sites_processed, - (double)elapsed/1000.0, - ((double)elapsed/1000.0)/(double)n_sites_processed); - } - n_sites_processed += 1; - - context = filter_each_read(context); - - if (ref.getBaseAsChar() == 'N') { return null; } - if (BaseUtils.simpleBaseToBaseIndex(ref.getBase()) == -1) { return null; } - if (context.getReads().size() <= 0) { return null; } - if (context.getReads().size() >= 10000) { return null; } // to deal with big piles -- totally arbitrary threshold - - this.ref = ref.getBaseAsChar(); - MultiSampleCallResult result = this.MultiSampleCall(tracker, ref.getBaseAsChar(), context, sample_names); - if ( INCLUDE_STATS ) stats_output_file.println(DepthStats.Row(ref.getBaseAsChar(), context)); - return result; - } - - public void onTraversalDone(String sum) - { - discovery_output_file.flush(); - discovery_output_file.close(); - - if ( INCLUDE_STATS ) { - stats_output_file.flush(); - stats_output_file.close(); - } - - out.println("MultiSampleCaller done."); - return; - } - - public String reduceInit() - { - return null; - } - - public String reduce(MultiSampleCallResult record, String sum) - { - if (record != null) - { - discovery_output_file.printf(record.toString() + "\n"); - } - return null; - } - - // END Walker Interface Functions - ///////// - - - 
///////// - // Calling Functions - - char ref; - protected ConfusionMatrix confusion_matrix; - - ClassicGenotypeLikelihoods reallyMakeGenotypeLikelihood(AlignmentContext context) { - List reads = context.getReads(); - List offsets = context.getOffsets(); - - // Handle single-base polymorphisms. - ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); - for ( int i = 0; i < reads.size(); i++ ) - { - //System.out.printf("DBG: %s\n", context.getLocation()); - - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - if (CONFUSION_MATRIX_FILE == null) - { - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); - } - else - { - String RG = (String)(read.getAttribute("RG")); - - assert(header != null); - assert(header.getReadGroup(RG) != null); - - String platform = (String)(header.getReadGroup(RG).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); - - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset], confusion_matrix, platform); - } - } - - return G; - } - - HashMap glCache = new HashMap(); - int cache_size = 0; - - ClassicGenotypeLikelihoods GenotypeOld(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) { - //ReadBackedPileup pileup = new ReadBackedPileup(ref, context); - //String bases = pileup.getBases(); - - List reads = context.getReads(); - List offsets = context.getOffsets(); - ref = Character.toUpperCase(ref); - - if (reads.size() == 0) { - ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); - return G; - } - - // Handle single-base polymorphisms. 
- ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); - for ( int i = 0; i < reads.size(); i++ ) - { - //System.out.printf("DBG: %s\n", context.getLocation()); - - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - if (CONFUSION_MATRIX_FILE == null) - { - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); - } - else - { - String RG = (String)(read.getAttribute("RG")); - - assert(header != null); - assert(header.getReadGroup(RG) != null); - - String platform = (String)(header.getReadGroup(RG).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); - - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset], confusion_matrix, platform); - } - } - G.ApplyPrior(ref, allele_likelihoods); - - // Handle indels - if (CALL_INDELS) - { - String[] indels = BasicPileup.indelPileup(reads, offsets); - IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq); - if (indel_call.getType() != null) - { - G.addIndelLikelihood(indel_call); - } - else - { - G.addIndelLikelihood(null); - } - } - - return G; - } - - ClassicGenotypeLikelihoods Genotype(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) { - return GenotypeCache(context, allele_likelihoods, indel_alt_freq ); - //return GenotypeOld(context, allele_likelihoods, indel_alt_freq ); - } - - - ClassicGenotypeLikelihoods GenotypeCache(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) - { - ref = Character.toUpperCase(ref); - - // Handle single-base polymorphisms. 
- ClassicGenotypeLikelihoods G = null; - if ( context.getReads().size() == 0 ) { - G = new ClassicGenotypeLikelihoods(); - return G; - } else { - if ( true && glCache.containsKey(context) ) { - ClassicGenotypeLikelihoods cached = glCache.get(context); - G = (ClassicGenotypeLikelihoods)cached.clone(); - } else { - G = reallyMakeGenotypeLikelihood(context); - if (cache_size < 5000) - { - //System.out.printf("cache add (%d)\n", cache_size); - glCache.put(context, G.clone()); - cache_size += context.getReads().size(); - } - else - { - //System.out.printf("cache skip (%d)\n", cache_size); - } - } - G.ApplyPrior(ref, allele_likelihoods); - } - - return G; - } - - // This version is a little faster. - double[] CountFreqs(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double[] allele_likelihoods = new double[4]; - - for (int x = 0; x < genotype_likelihoods.length; x++) - { - ClassicGenotypeLikelihoods G = genotype_likelihoods[x]; - - if (G.coverage == 0) { continue; } - - double[] personal_allele_likelihoods = new double[4]; - - int k = 0; - for (int i = 0; i < 4; i++) - { - for (int j = i; j < 4; j++) - { - double likelihood = Math.pow(10,G.likelihoods[k]); - personal_allele_likelihoods[i] += likelihood; - personal_allele_likelihoods[j] += likelihood; - k++; - } - } - double sum = 0; - for (int y = 0; y < 4; y++) { sum += personal_allele_likelihoods[y]; } - for (int y = 0; y < 4; y++) { personal_allele_likelihoods[y] /= sum; } - for (int y = 0; y < 4; y++) { allele_likelihoods[y] += personal_allele_likelihoods[y]; } - } - - double sum = 0; - for (int i = 0; i < 4; i++) { sum += allele_likelihoods[i]; } - for (int i = 0; i < 4; i++) { allele_likelihoods[i] /= sum; } - - return allele_likelihoods; - } - - - double CountIndelFreq(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - HashMap indel_allele_likelihoods = new HashMap(); - - double pRef = 0; - double pAlt = 0; - - for (int j = 0; j < sample_names.size(); j++) - { - double personal_pRef = 0; - double 
personal_pAlt = 0; - - IndelLikelihood indel_likelihood = genotype_likelihoods[j].getIndelLikelihood(); - personal_pRef += 2*Math.pow(10, indel_likelihood.pRef()) + Math.pow(10, indel_likelihood.pHet()); - personal_pAlt += 2*Math.pow(10, indel_likelihood.pHom()) + Math.pow(10, indel_likelihood.pHet()); - - personal_pRef = personal_pRef / (personal_pAlt + personal_pRef); - personal_pAlt = personal_pAlt / (personal_pAlt + personal_pRef); - - pRef += personal_pRef; - pAlt += personal_pAlt; - } - - pRef = pRef / (pRef + pAlt); - pAlt = pAlt / (pRef + pAlt); - - return pAlt; - } - - // Potential precision error here. - double Compute_pD(ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] sample_weights) - { - double pD = 0; - for (int i = 0; i < sample_names.size(); i++) - { - double sum = 0; - for (int j = 0; j < 10; j++) - { - sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]); - } - pD += Math.log10(sample_weights[i] * sum); - } - return pD; - } - - double Compute_pNull(AlignmentContext[] contexts, double[] sample_weights) - { - double[] allele_likelihoods = new double[4]; - for (int i = 0; i < 4; i++) { allele_likelihoods[i] = 1e-6/3.0; } - allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(ref)] = 1.0-1e-6; - ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()]; - for (int j = 0; j < sample_names.size(); j++) - { - G[j] = Genotype(contexts[j], allele_likelihoods, 1e-6); - } - return Compute_pD(G, sample_weights); - } - - double[] Compute_SampleWeights(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double[] pD = new double[sample_names.size()]; - double total_pD = 0; - for (int i = 0; i < sample_names.size(); i++) - { - double sum = 0; - for (int j = 0; j < 10; j++) - { - sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]); - } - pD[i] = sum; - total_pD += pD[i]; - } - - for (int i = 0; i < sample_names.size(); i++) - { - pD[i] /= total_pD; - } - - return pD; - } - - // Some globals to cache results. 
- EM_Result em_result; - double pD; - double pNull; - double lod; - double LOD(AlignmentContext[] contexts) - { - em_result = EM(contexts); - ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods; - pD = Compute_pD(G, em_result.sample_weights); - pNull = Compute_pNull(contexts, em_result.sample_weights); - - if (ALLELE_FREQUENCY_PRIOR) - { - // Apply p(f). - double pVar = 0.0; - for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } - - double p0 = Math.log10(1 - pVar); - double pF; - - double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods); - - if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; } - else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); } - - //System.out.printf("DBG %s %c %f %f %f %f (%.20f) %f ", contexts[0].getLocation(), ref, pD, pF, pNull, p0, Compute_alt_freq(ref, em_result.allele_likelihoods), 2.0 * em_result.EM_N); - //for (int i = 0; i < 4; i++) { System.out.printf("%f ", em_result.allele_likelihoods[i]); } - //System.out.printf("\n"); - - pD = pD + pF; - pNull = pNull + p0; - } - - lod = pD - pNull; - return lod; - } - - class EM_Result - { - String[] sample_names; - ClassicGenotypeLikelihoods[] genotype_likelihoods; - double[] allele_likelihoods; - double[] sample_weights; - int EM_N; - - public EM_Result(List sample_names, ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] allele_likelihoods, double[] sample_weights) - { - this.sample_names = new String[1]; - this.sample_names = sample_names.toArray(this.sample_names); - this.genotype_likelihoods = genotype_likelihoods; - this.allele_likelihoods = allele_likelihoods; - this.sample_weights = sample_weights; - - EM_N = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage > 0) { EM_N += 1; } - } - - } - - } - - final static double[] sample_weights = new double[1000]; - static { - for (int i = 0; i < 1000; i++) - { - //sample_weights[i] = 1.0/(double)i; - sample_weights[i] = 1.0; - } - } - - EM_Result 
EM(AlignmentContext[] contexts) - { - final boolean DEBUG_PRINT = false; - double[] allele_likelihoods = new double[4]; - - // These initial conditions should roughly replicate classic SSG. (at least on hets) - for (int i = 0; i < 4; i++) - { - if (i == BaseUtils.simpleBaseToBaseIndex(ref)) { allele_likelihoods[i] = 0.9994999; } //sqrt(0.999) - else { allele_likelihoods[i] = 0.0005002502; } // 0.001 / (2 * sqrt(0.999) - } - double indel_alt_freq = 1e-4; - - ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()]; - //ClassicGenotypeLikelihoods[] Weighted_G = new ClassicGenotypeLikelihoods[sample_names.size()]; - - if ( DEBUG_PRINT ) System.out.printf("%n"); - - for (int i = 0; i < MAX_ITERATIONS; i++) - { - for (int j = 0; j < sample_names.size(); j++) - { - G[j] = Genotype(contexts[j], allele_likelihoods, indel_alt_freq); - //if (WEIGHT_SAMPLES) { G[j].ApplyWeight(sample_weights[j]); } - } - - double[] old_allele_likelihoods = allele_likelihoods; - allele_likelihoods = CountFreqs(G); - double alDelta = 0.0; - for (int j = 0; j < 4; j++) { alDelta += Math.abs(old_allele_likelihoods[j] - allele_likelihoods[j]); } - - if ( DEBUG_PRINT ) - { - System.out.printf("%s AL %f %f %f %f => delta=%e < %e == %b%n", - contexts[0].getLocation(), - Math.log10(allele_likelihoods[0]), Math.log10(allele_likelihoods[1]), Math.log10(allele_likelihoods[2]), Math.log10(allele_likelihoods[3]), - alDelta, ALLELE_FREQ_TOLERANCE, alDelta < ALLELE_FREQ_TOLERANCE); - } - - //if ( alDelta < ALLELE_FREQ_TOLERANCE ) { - // System.out.printf("Converged after %d iterations%n", i); - // break; - //} - -// if (CALL_INDELS) -// { -// indel_alt_freq = CountIndelFreq(G); -// } - -// if (WEIGHT_SAMPLES) -// { -// sample_weights = Compute_SampleWeights(G); -// } - } - - return new EM_Result(sample_names, G, allele_likelihoods, sample_weights); - } - - // Hacky global variables for debugging. 
- double StrandScore(AlignmentContext context) - { - //AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0); - - AlignmentContext fw = filterAlignmentContext(context, "+"); - AlignmentContext bw = filterAlignmentContext(context, "-"); - AlignmentContext[] contexts_fw = filterAlignmentContext(fw, sample_names, 0); - AlignmentContext[] contexts_bw = filterAlignmentContext(bw, sample_names, 0); - - EM_Result em_fw = EM(contexts_fw); - EM_Result em_bw = EM(contexts_bw); - - double pNull_fw = Compute_pNull(contexts_fw, em_fw.sample_weights); - double pNull_bw = Compute_pNull(contexts_bw, em_bw.sample_weights); - - double pD_fw = Compute_pD(em_fw.genotype_likelihoods, em_fw.sample_weights); - double pD_bw = Compute_pD(em_bw.genotype_likelihoods, em_bw.sample_weights); - - if (ALLELE_FREQUENCY_PRIOR) - { - // Apply p(f). - double pVar = 0.0; - for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } - - pD_fw = pD_fw + Math.log10(THETA/(2.0*em_fw.EM_N * Compute_alt_freq(ref, em_fw.allele_likelihoods))); - pNull_fw = pNull_fw + Math.log10(1 - pVar); - - pD_bw = pD_bw + Math.log10(THETA/(2.0*em_bw.EM_N * Compute_alt_freq(ref, em_bw.allele_likelihoods))); - pNull_bw = pNull_bw + Math.log10(1 - pVar); - } - - double EM_alt_freq_fw = Compute_alt_freq(ref, em_fw.allele_likelihoods); - double EM_alt_freq_bw = Compute_alt_freq(ref, em_bw.allele_likelihoods); - - //double pNull = Compute_pNull(contexts); - //double lod = LOD(contexts); - - double lod_fw = (pD_fw + pNull_bw) - pNull; - double lod_bw = (pD_bw + pNull_fw) - pNull; - double strand_score = Math.max(lod_fw - lod, lod_bw - lod); - return strand_score; - } - -// ClassicGenotypeLikelihoods HardyWeinberg(double[] allele_likelihoods) -// { -// ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); -// int k = 0; -// for (int i = 0; i < 4; i++) -// { -// for (int j = i; j < 4; j++) -// { -// G.likelihoods[k] = allele_likelihoods[i] * allele_likelihoods[j]; -// k++; -// } -// } 
-// return G; -// } - - char PickAlt(char ref, double[] allele_likelihoods) - { - Integer[] perm = MathUtils.sortPermutation(allele_likelihoods); - if (perm[3] != BaseUtils.simpleBaseToBaseIndex(ref)) { return BaseUtils.baseIndexToSimpleBaseAsChar(perm[3]); } - else { return BaseUtils.baseIndexToSimpleBaseAsChar(perm[2]); } - } - - double Compute_discovery_lod(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double pBest = 0; - double pRef = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - pBest += genotype_likelihoods[i].BestPosterior(); - pRef += genotype_likelihoods[i].RefPosterior(ref); - } - return pBest - pRef; - } - - // this one is a bit of a lazy hack. - double Compute_alt_freq(char ref, double[] allele_likelihoods) - { - return allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(PickAlt(ref, allele_likelihoods))]; - } - - int Compute_n_ref(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - int n = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage == 0) { continue; } - String g = genotype_likelihoods[i].BestGenotype(); - if ((g.charAt(0) == ref) && (g.charAt(1) == ref)) { n += 1; } - } - return n; - } - - int Compute_n_het(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - int n = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage == 0) { continue; } - String g = genotype_likelihoods[i].BestGenotype(); - if ((g.charAt(0) == ref) && (g.charAt(1) != ref)) { n += 1; } - if ((g.charAt(0) != ref) && (g.charAt(1) == ref)) { n += 1; } - } - return n; - } - - int Compute_n_hom(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - int n = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage == 0) { continue; } - String g = genotype_likelihoods[i].BestGenotype(); - if ((g.charAt(0) != ref) && (g.charAt(1) != ref)) { n += 1; } - } - return 
n; - } - - // This should actually return a GLF Record - MultiSampleCallResult MultiSampleCall(RefMetaDataTracker tracker, char ref, AlignmentContext context, List sample_names) - { - String in_dbsnp; - if (tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME).size() > 0) { in_dbsnp = "known"; } else { in_dbsnp = "novel"; } - - AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0); - glCache.clear(); // reset the cache - cache_size = 0; - - double lod = LOD(contexts); - - int n_ref = Compute_n_ref(ref, em_result.genotype_likelihoods); - int n_het = Compute_n_het(ref, em_result.genotype_likelihoods); - int n_hom = Compute_n_hom(ref, em_result.genotype_likelihoods); - - //double strand_score = lod > MIN_LOD_FOR_STRAND ? StrandScore(context) : 0.0; - double strand_score; - if (n_het+n_hom > 0) { strand_score = StrandScore(context); } - else { strand_score = 0; } - - //EM_Result em_result = EM(contexts); - //ClassicGenotypeLikelihoods population_genotype_likelihoods = HardyWeinberg(em_result.allele_likelihoods); - - //double pD = Compute_pD(em_result.genotype_likelihoods); - //double pNull = Compute_pNull(contexts); - - //double discovery_lod = Compute_discovery_lod(ref, em_result.genotype_likelihoods); - double alt_freq = Compute_alt_freq(ref, em_result.allele_likelihoods); - - char alt = 'N'; - //if (lod > 0.0) { alt = PickAlt(ref, em_result.allele_likelihoods); } - if ((n_het > 0) || (n_hom > 0)) { alt = PickAlt(ref, em_result.allele_likelihoods); } - - if ( INCLUDE_GENOTYPES ) { - for (int i = 0; i < em_result.genotype_likelihoods.length; i++) - { - individual_output_file.printf("%s %c %s ", context.getLocation(), ref, sample_names.get(i)); - individual_output_file.printf("%s %f %f %s ", em_result.genotype_likelihoods[i].BestGenotype(), - em_result.genotype_likelihoods[i].LodVsNextBest(), - em_result.genotype_likelihoods[i].LodVsRef(ref), - in_dbsnp); - - //individual_output.printf("%s ", new ReadBackedPileup(ref, 
contexts[i]).getBaseCountsString()); - assert(em_result.genotype_likelihoods[i] != null); - em_result.genotype_likelihoods[i].sort(); - assert(em_result.genotype_likelihoods[i].sorted_likelihoods != null); - - if ( INCLUDE_GENOTYPES ) { - for (int j = 0; j < em_result.genotype_likelihoods[i].sorted_likelihoods.length; j++) - { - individual_output_file.printf("%f ", em_result.genotype_likelihoods[i].likelihoods[j]); - } - individual_output_file.printf("\n"); - } - } - } - - return new MultiSampleCallResult(context.getLocation(), ref, alt, em_result, lod, strand_score, pD, pNull, in_dbsnp, n_ref, n_het, n_hom, em_result.EM_N, alt_freq); - } - - // END Calling Functions - ///////// - - ///////// - // Utility Functions - - /// Filter a locus context by forward and backward - private AlignmentContext filterAlignmentContext(AlignmentContext context, String strand) - { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - Integer offset = context.getOffsets().get(i); - - // Filter for strandedness - if ((!strand.contains("+")) && (read.getReadNegativeStrandFlag() == false)) { continue; } - if ((!strand.contains("-")) && (read.getReadNegativeStrandFlag() == true)) { continue; } - reads.add(read); - offsets.add(offset); - } - return new AlignmentContext(context.getLocation(), reads, offsets); - } - - // Filter a locus context by sample ID - protected AlignmentContext[] filterAlignmentContext(AlignmentContext context, List sample_names, int downsample) - { - HashMap index = new HashMap(); - for (int i = 0; i < sample_names.size(); i++) - { - index.put(sample_names.get(i), i); - } - - AlignmentContext[] contexts = new AlignmentContext[sample_names.size()]; - ArrayList[] reads = new ArrayList[sample_names.size()]; - ArrayList[] offsets = new ArrayList[sample_names.size()]; - - for (int i = 0; i < sample_names.size(); i++) - { - reads[i] = new 
ArrayList(); - offsets[i] = new ArrayList(); - } - - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - Integer offset = context.getOffsets().get(i); - String RG = (String)(read.getAttribute("RG")); - - assert(header != null); - assert(header.getReadGroup(RG) != null); - - String sample = header.getReadGroup(RG).getSample(); - if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - reads[index.get(sample)].add(read); - offsets[index.get(sample)].add(offset); - } - - if (downsample != 0) - { - for (int j = 0; j < reads.length; j++) - { - List perm = new ArrayList(); - for (int i = 0; i < reads[j].size(); i++) { perm.add(i); } - perm = MathUtils.randomSubset(perm, downsample); - - ArrayList downsampled_reads = new ArrayList(); - ArrayList downsampled_offsets = new ArrayList(); - - for (int i = 0; i < perm.size(); i++) - { - downsampled_reads.add(reads[j].get(perm.get(i))); - downsampled_offsets.add(offsets[j].get(perm.get(i))); - } - - reads[j] = downsampled_reads; - offsets[j] = downsampled_offsets; - contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); - } - } - else - { - for (int j = 0; j < reads.length; j++) - { - contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); - } - } - - return contexts; - } - - private AlignmentContext filter_each_read(AlignmentContext L) - { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - for (int i = 0; i < L.getReads().size(); i++) - { - SAMRecord read = L.getReads().get(i); - Integer offset = L.getOffsets().get(i); - String RG = (String)(read.getAttribute("RG")); - - assert(this.header != null); - //assert(this.header.getReadGroup(RG) != null); - if (this.header.getReadGroup(RG) == null) { continue; } - - // skip bogus data - if (read.getMappingQuality() == 0) { continue; } - - String sample = this.header.getReadGroup(RG).getSample(); - //if (SAMPLE_NAME_REGEX 
!= null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - - reads.add(read); - offsets.add(offset); - } - - AlignmentContext ans = new AlignmentContext(L.getLocation(), reads, offsets); - - return ans; - } - - // END Utility functions - ///////// - - - - -} diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/MultiSampleCallerAccuracyTest.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/MultiSampleCallerAccuracyTest.java deleted file mode 100644 index d171e5c04..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/MultiSampleCallerAccuracyTest.java +++ /dev/null @@ -1,195 +0,0 @@ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.playground.indels.Matrix; - -import java.util.*; -import java.io.*; - -// Beta iterative multi-sample caller -// j.maguire 6-11-2009 - -public class MultiSampleCallerAccuracyTest extends MultiSampleCaller -{ - @Argument(required=false, shortName="lod_threshold", doc="") public double LOD_THRESHOLD = 1e-6; - @Argument(required=true, shortName="stats_output", doc="") public String STATS_OUTPUT; - - Matrix n_variants; - Matrix n_found; - - PrintStream stats_output; - - public void initialize() - { - this.DISCOVERY_OUTPUT = "/dev/null"; - this.INDIVIDUAL_OUTPUT = "/dev/null"; - - super.initialize(); - - n_variants = new Matrix(sample_names.size()*2, sample_names.size()*2); - n_found = new Matrix(sample_names.size()*2, sample_names.size()*2); - - for (int i = 0; i < sample_names.size()*2; i++) - { - for (int j = 0; j < sample_names.size()*2; j++) - { - n_variants.set(i,j,0); - n_found.set(i,j,0); - } - } - - try - { - stats_output = new PrintStream(STATS_OUTPUT); - 
} - catch (Exception e) - { - throw new RuntimeException(e); - } - - } - - public MultiSampleCallResult map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - HapMapGenotypeROD hapmap = (HapMapGenotypeROD)tracker.lookup("hapmap", null); - - // Collect all the variants and the normals. - ArrayList variant_samples = new ArrayList(); - ArrayList reference_samples = new ArrayList(); - - int n_ref_chromosomes = 0; - int n_alt_chromosomes = 0; - - String reference_genotype = String.format("%c%c", ref, ref); - for (int i = 0; i < sample_names.size(); i++) - { - String true_genotype = hapmap.get(sample_names.get(i)); - if (true_genotype == null) { continue; } - - if (true_genotype.equals(reference_genotype)) { reference_samples.add(sample_names.get(i)); } - else { variant_samples.add(sample_names.get(i)); } - - if (true_genotype.equals(reference_genotype)) { n_ref_chromosomes += 1; } - else if (true_genotype.contains(String.format("%c",ref))) { n_ref_chromosomes += 1; n_alt_chromosomes += 1; } - else { n_alt_chromosomes += 2; } - - } - - // Put together a context. - ArrayList working_samples = new ArrayList(); - working_samples.addAll(variant_samples); - working_samples.addAll(reference_samples); - AlignmentContext working_context = filterAlignmentContextBySamples(context, working_samples); - - // Call. - MultiSampleCallResult call_result = super.map(tracker, ref, working_context); - EM_Result em_result = call_result.em_result; - - // Compute Statistics. 
- if (n_variants == null) { System.out.printf("n_variants is null\n"); } - if (n_found == null) { System.out.printf("n_found is null\n"); } - n_variants.set(n_ref_chromosomes, n_alt_chromosomes, n_variants.get(n_ref_chromosomes, n_alt_chromosomes)+1); - if ((call_result.lod > LOD_THRESHOLD) && (n_alt_chromosomes >= 1)) - { - n_found.set(n_ref_chromosomes, n_alt_chromosomes, n_found.get(n_ref_chromosomes, n_alt_chromosomes)+1); - } - - return null; - } - - private void PrintStats() - { - stats_output.printf("n_reference_chromosomes n_variant_chromosomes n_sites n_found fraction_found\n"); - for (int i = 0; i < sample_names.size()*2; i++) - { - for (int j = 0; j < sample_names.size()*2; j++) - { - int N = (int)n_variants.get(i,j); - int found = (int)n_found.get(i,j); - - if (N == 0) { continue; } - if (found == 0) { continue; } - - double fraction_found = 100.0 * (double)found / (double)N; - n_variants.set(i,j,0); - n_found.set(i,j,0); - stats_output.printf("%d %d %d %d %f\n", - i, - j, - N, - found, - fraction_found); - } - } - } - - public void onTraversalDone(String sum) - { - PrintStats(); - stats_output.flush(); - stats_output.close(); - out.println("MultiSampleCallerAccuracyTest done."); - return; - } - - public String reduceInit() - { - return super.reduceInit(); - } - - public String reduce(MultiSampleCallResult record, String sum) - { - return super.reduce(record, sum); - } - - // END Walker Interface Functions - ///////// - - - ///////// - // BEGIN Utility Functions - - // Filter a locus context by sample IDs - // (pulls out only reads from the specified samples, and returns them in one context). 
- private AlignmentContext filterAlignmentContextBySamples(AlignmentContext context, List sample_names) - { - HashSet index = new HashSet(); - for (int i = 0; i < sample_names.size(); i++) - { - index.add(sample_names.get(i)); - } - - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - Integer offset = context.getOffsets().get(i); - String RG = (String)(read.getAttribute("RG")); - - assert(header != null); - assert(header.getReadGroup(RG) != null); - - String sample = header.getReadGroup(RG).getSample(); - if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - - if (index.contains(sample)) - { - reads.add(read); - offsets.add(offset); - } - } - - return new AlignmentContext(context.getLocation(), reads, offsets); - } - - // END Utility Functions - ///////// - -} diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/PointEstimateGenotypeCalculationModel.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/PointEstimateGenotypeCalculationModel.java deleted file mode 100755 index ad9fcb917..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/PointEstimateGenotypeCalculationModel.java +++ /dev/null @@ -1,289 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.gatk.contexts.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.genotype.*; - -import java.util.*; - -public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculationModel { - - protected PointEstimateGenotypeCalculationModel() {} - - // the allele frequencies - private double[] alleleFrequencies = new double[4]; - 
private double[] oldAlleleFrequencies; - - // the GenotypeLikelihoods map - private HashMap GLs = new HashMap(); - - // Allele frequency initialization values from the original MSG code (so we can be consistent) - private static final double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity) - private static final double REF = 0.9994999; //sqrt(1-heterozygosity) - - - // overload this method so we can special-case the single sample - public VariantCallContext callLocus(RefMetaDataTracker tracker, char ref, GenomeLoc loc, Map contexts, DiploidGenotypePriors priors) { - - // we don't actually want to run EM for single samples - if ( samples.size() == 1 ) { - - // get the context for the sample - String sample = samples.iterator().next(); - StratifiedAlignmentContext sampleContext = contexts.get(sample); - - // if there were no good bases, the context wouldn't exist - if ( sampleContext == null ) - return null; - - // get the genotype likelihoods - Pair discoveryGL = getSingleSampleLikelihoods(sampleContext, priors, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - - // find the index of the best genotype - double[] normPosteriors = discoveryGL.second.getNormalizedPosteriors(); - Integer sortedNormPosteriors[] = Utils.SortPermutation(normPosteriors); - int bestIndex = sortedNormPosteriors[sortedNormPosteriors.length - 1]; - - // flag to determine if ref is the best call (not necessary in genotype mode) - boolean bestIsRef = false; - - // calculate the phred-scaled confidence score - double phredScaledConfidence; - if ( GENOTYPE_MODE ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(1.0 - normPosteriors[bestIndex]); - } else { - int refIndex = DiploidGenotype.createHomGenotype(ref).ordinal(); - bestIsRef = (refIndex == bestIndex); - double pError = (bestIsRef ? 
1.0 - normPosteriors[refIndex] : normPosteriors[refIndex]); - phredScaledConfidence = QualityUtils.phredScaleErrorRate(pError); - } - - // are we above the lod threshold for emitting calls (and not in all-bases mode)? - if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) - return new VariantCallContext(phredScaledConfidence >= CONFIDENCE_THRESHOLD); - - // we can now create the genotype call object - GenotypeCall call = GenotypeWriterFactory.createSupportedGenotypeCall(OUTPUT_FORMAT, ref, loc); - - // set the genotype and confidence - double[] posteriors = discoveryGL.second.getPosteriors(); - Integer sorted[] = Utils.SortPermutation(posteriors); - DiploidGenotype bestGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 1]]; - DiploidGenotype nextGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 2]]; - call.setNegLog10PError(posteriors[bestGenotype.ordinal()] - posteriors[nextGenotype.ordinal()]); - call.setGenotype(bestGenotype); - - if ( call instanceof ReadBacked ) { - ((ReadBacked)call).setPileup(discoveryGL.first); - } - if ( call instanceof SampleBacked ) { - ((SampleBacked)call).setSampleName(sample); - } - if ( call instanceof LikelihoodsBacked ) { - ((LikelihoodsBacked)call).setLikelihoods(discoveryGL.second.getLikelihoods()); - } - if ( call instanceof PosteriorsBacked ) { - ((PosteriorsBacked)call).setPosteriors(posteriors); - } - - VariationCall locusdata = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, loc, bestIsRef ? 
Variation.VARIANT_TYPE.REFERENCE : Variation.VARIANT_TYPE.SNP); - if ( locusdata != null ) { - if ( locusdata instanceof ConfidenceBacked ) { - ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence); - } - if ( locusdata instanceof IDBacked ) { - rodDbSNP dbsnp = getDbSNP(tracker); - if ( dbsnp != null ) - ((IDBacked)locusdata).setID(dbsnp.getRS_ID()); - } - locusdata.setGenotypeCalls(Arrays.asList((Genotype)call)); - } - - call.setVariation(locusdata); - - return new VariantCallContext(locusdata, Arrays.asList((Genotype)call), phredScaledConfidence >= CONFIDENCE_THRESHOLD); - } - - return super.callLocus(tracker, ref, loc, contexts, priors); - } - - private Pair getSingleSampleLikelihoods(StratifiedAlignmentContext sampleContext, DiploidGenotypePriors priors, StratifiedAlignmentContext.StratifiedContextType contextType) { - // create the pileup - AlignmentContext myContext = sampleContext.getContext(contextType); - ReadBackedPileup pileup = myContext.getBasePileup(); - - // create the GenotypeLikelihoods object - GenotypeLikelihoods GL = new GenotypeLikelihoods(baseModel, priors, defaultPlatform); - GL.add(pileup, true); - return new Pair(pileup, GL); - } - - protected void initializeAlleleFrequencies(int numSamplesInContext, char ref) { - for (int i = 0; i < 4; i++) - alleleFrequencies[i] = NON_REF; - alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF; - - for (int i = 0; i < 4; i++) - logger.debug("Initial allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i]); - } - - protected void initializeGenotypeLikelihoods(char ref, Map contexts, DiploidGenotypePriors priors, StratifiedAlignmentContext.StratifiedContextType contextType) { - GLs.clear(); - - DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies); - - for ( String sample : contexts.keySet() ) { - StratifiedAlignmentContext context = contexts.get(sample); - ReadBackedPileup pileup = 
context.getContext(contextType).getBasePileup(); - - // create the GenotypeLikelihoods object - GenotypeLikelihoods GL = new GenotypeLikelihoods(baseModel, AFPriors, defaultPlatform); - GL.add(pileup, true); - - GLs.put(sample, GL); - } - } - - private static DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFreqs) { - // convert to log-space - double[] log10Freqs = new double[4]; - for (int i = 0; i < 4; i++) - log10Freqs[i] = Math.log10(alleleFreqs[i]); - - double[] alleleFreqPriors = new double[10]; - - // this is the Hardy-Weinberg based allele frequency (p^2, q^2, 2pq) - for ( DiploidGenotype g : DiploidGenotype.values() ) { - alleleFreqPriors[g.ordinal()] = log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base1)] + log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base2)]; - // add a factor of 2 for the 2pq case - if ( g.isHet() ) - alleleFreqPriors[g.ordinal()] += Math.log10(2); - } - - return new DiploidGenotypePriors(alleleFreqPriors); - } - - protected void calculateAlleleFrequencyPosteriors() { - // initialization - oldAlleleFrequencies = alleleFrequencies.clone(); - for (int i = 0; i < 4; i++) - alleleFrequencies[i] = 0.0; - - for ( GenotypeLikelihoods GL : GLs.values() ) { - double[] normalizedPosteriors = GL.getNormalizedPosteriors(); - - // calculate the posterior weighted frequencies for this sample - double[] personalAllelePosteriors = new double[4]; - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double posterior = normalizedPosteriors[g.ordinal()] / 2.0; // each base gets half the probability - personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base1)] += posterior; - personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base2)] += posterior; - } - - for (int i = 0; i < 4; i++) - alleleFrequencies[i] += personalAllelePosteriors[i]; - } - - // normalize - double sum = 0.0; - for (int i = 0; i < 4; i++) - sum += alleleFrequencies[i]; - for (int i = 0; i < 4; i++) - alleleFrequencies[i] /= sum; - - for (int i = 0; i 
< 4; i++) - logger.debug("New allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i]); - } - - protected void applyAlleleFrequencyToGenotypeLikelihoods() { - DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies); - for ( GenotypeLikelihoods GL : GLs.values() ) - GL.setPriors(AFPriors); - } - - protected boolean isStable() { - // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC - double AF_delta = 0.0; - for (int i = 0; i < 4; i++) - AF_delta += Math.abs(oldAlleleFrequencies[i] - alleleFrequencies[i]); - - return (AF_delta < EM_STABILITY_METRIC); - } - - protected EMOutput computePofF(char ref) { - // some debugging output - for ( String sample : GLs.keySet() ) - logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString()); - - // compute pD and pNull without allele frequencies - double pD = compute_pD(GLs); - double pNull = compute_pNull(ref, GLs); - logger.debug("Original pD=" + pD + ", pNull=" + pNull); - - // compute p0 - double pVar = 0.0; - for (int i = 1; i < GLs.size(); i++) - pVar += heterozygosity/(double)i; - double p0 = Math.log10(1.0 - pVar); - - // compute actual priors: theta / MAF - double MAF; - Integer[] sortedIndexes = Utils.SortPermutation(alleleFrequencies); - if ( sortedIndexes[3] != BaseUtils.simpleBaseToBaseIndex(ref) ) - MAF = alleleFrequencies[sortedIndexes[3]]; - else - MAF = alleleFrequencies[sortedIndexes[2]]; - - // compute pF - double pF; - double expectedChromosomes = 2.0 * (double)GLs.size() * MAF; - if ( expectedChromosomes < 1.0 ) - pF = p0; - else - pF = Math.log10(heterozygosity / expectedChromosomes); - logger.debug("p0=" + p0 + ", pF=" + pF); - - pD += pF; - pNull += p0; - logger.debug("Final pD=" + pD + ", pNull=" + pNull); - - return new EMOutput(pD, pNull, pF, MAF, GLs); - } - - private static double compute_pD(HashMap GLs) { - double pD = 0.0; - for ( GenotypeLikelihoods GL : GLs.values() ) { - 
double sum = 0.0; - for ( DiploidGenotype g : DiploidGenotype.values() ) { - sum += Math.pow(10, GL.getPosterior(g)); - } - pD += Math.log10(sum); - } - return pD; - } - - private static double compute_pNull(char ref, HashMap GLs) { - // compute null likelihoods - double[] alleleLikelihoods = new double[4]; - for (int i = 0; i < 4; i++) - alleleLikelihoods[i] = 1e-6/3.0; - alleleLikelihoods[BaseUtils.simpleBaseToBaseIndex(ref)] = 1.0-1e-6; - DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleLikelihoods); - - HashMap GL_null = new HashMap(); - try { - for ( String sample : GLs.keySet() ) { - GenotypeLikelihoods GL = (GenotypeLikelihoods)GLs.get(sample).clone(); - GL.setPriors(AFPriors); - GL_null.put(sample, GL); - } - } catch (CloneNotSupportedException e) { - throw new StingException("Clone() not supported for given GenotypeLikelihoods subclass?"); - } - - return compute_pD(GL_null); - } -} \ No newline at end of file diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/PrintHapmapGenotypes.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/PrintHapmapGenotypes.java deleted file mode 100644 index cf89cca51..000000000 --- a/archive/java/src/org/broadinstitute/sting/multisamplecaller/PrintHapmapGenotypes.java +++ /dev/null @@ -1,60 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers; - -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.*; - -// Sanity check to test HapmapGenotypeROD -// Compute %dbsnp and transition/transversion rate. 
- -@By(DataSource.REFERENCE) -@Requires(DataSource.REFERENCE) -@Allows(DataSource.REFERENCE) -public class PrintHapmapGenotypes extends RefWalker -{ - //@Argument(required=false, shortName="n_frequency_bins", doc="") public int n_frequency_bins = 20; - - public void initialize() - { - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - // Iterate over each analysis, and update it - //rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbsnp", null); - HapMapGenotypeROD A = (HapMapGenotypeROD)tracker.lookup("A", null); - - if (A != null) - { - GenomeLoc loc = A.getLocation(); - String[] sample_ids = A.getSampleIDs(); - String[] genotypes = A.getGenotypes(); - - for (int i = 0; i < sample_ids.length; i++) - { - out.printf("%s %s %s\n", loc, sample_ids[i], genotypes[i]); - } - out.printf("\n"); - } - - return 1; - } - - // Given result of map function - public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) - { - return treeReduce(sum,value); - } - public Integer treeReduce(Integer lhs, Integer rhs) - { - return lhs + rhs; - } - - public void onTraversalDone(Integer result) - { - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldepthofcoverage/DepthOfCoverageWalker.java b/archive/java/src/org/broadinstitute/sting/oldepthofcoverage/DepthOfCoverageWalker.java deleted file mode 100755 index 3abb9669e..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldepthofcoverage/DepthOfCoverageWalker.java +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software 
is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.coverage; - -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.*; -import net.sf.samtools.SAMReadGroupRecord; - -import java.util.*; - -/** - * Computes the depth of coverage at all loci in the specified region of the reference. Includes features to control - * grouping of results by read group or by sample, or filtering loci with either a small number of overlapping alignments - * or with alignments of poor mapping quality. Can optionally include individual base counts at each locus. 
- */ -@By(DataSource.REFERENCE) -public class DepthOfCoverageWalker extends LocusWalker { - - @Argument(fullName="suppressLocusPrinting", shortName= "noLocus", doc="Suppress printing", required=false) - public boolean suppressLocusPrinting = false; - - @Argument(fullName="suppressIntervalPrinting", shortName= "noInterval", doc="Suppress printing", required=false) - public boolean suppressIntervalPrinting = false; - - @Argument(fullName="printBaseCounts", shortName ="bases", doc="Print individual base counts (A,C,G,T only)", required=false) - protected boolean printBaseCounts = false; - - @Argument(fullName="minMAPQ", shortName ="minMAPQ", doc="If provided, we will also list read counts with MAPQ >= this value at a locus in coverage",required=false) - protected int excludeMAPQBelowThis = -1; - - @Argument(fullName = "minBaseQualityScore", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) - public Integer minBaseQ = -1; - - @Argument(fullName="minDepth", shortName ="minDepth", doc="If provided, we will also list the percentage of loci with depth >= this value per interval",required=false) - protected int minDepthForPercentage = -1; - - @Argument(fullName="byReadGroup", shortName="byRG", doc="List read depths for each read group") - protected boolean byReadGroup = false; - - @Argument(fullName="bySample", shortName="bySample", doc="List read depths for each sample") - protected boolean bySample = false; - - @Argument(fullName="printHistogram", shortName="histogram", doc="Print a histogram of the coverage") - protected boolean printHistogram = false; - - - // keep track of the read group and sample names - private TreeSet readGroupNames = new TreeSet(); - private TreeSet sampleNames = new TreeSet(); - - // keep track of the histogram data - private ExpandingArrayList coverageHist = null; - private long maxDepth = 0; - private long totalLoci = 0; - - // we want to see reads with deletions - public boolean 
includeReadsWithDeletionAtLoci() { return true; } - - public void initialize() { - - // initialize histogram array - if ( printHistogram ) { - coverageHist = new ExpandingArrayList(); - } - - // initialize read group names from BAM header - if ( byReadGroup ) { - List readGroups = this.getToolkit().getSAMFileHeader().getReadGroups(); - for ( SAMReadGroupRecord record : readGroups ) - readGroupNames.add(record.getReadGroupId()); - } - - // initialize sample names from BAM header - if ( bySample ) { - List readGroups = this.getToolkit().getSAMFileHeader().getReadGroups(); - for ( SAMReadGroupRecord record : readGroups ) { - String sample = record.getSample(); - if ( sample != null ) - sampleNames.add(sample); - } - } - - // build and print the per-locus header - if ( !suppressLocusPrinting ) { - out.println("\nPER_LOCUS_COVERAGE_SECTION"); - printHeaderLine(false); - } - } - - public DoCInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - // fill in and print all of the per-locus coverage data, then return it to reduce - - ReadBackedPileup pileup = context.getPileup().getBaseFilteredPileup(minBaseQ); - - DoCInfo info = new DoCInfo(); - info.totalCoverage = pileup.size(); - - long nBadMAPQReads = 0, nDeletionReads = 0; - for ( PileupElement p : pileup ) { - - if ( excludeMAPQBelowThis > 0 && p.getRead().getMappingQuality() < excludeMAPQBelowThis ) - nBadMAPQReads++; - else if ( p.isDeletion() ) - nDeletionReads++; - - if ( printBaseCounts ) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(p.getBase()); - if ( baseIndex != -1 ) - info.baseCounts[baseIndex]++; - } - - SAMReadGroupRecord readGroup = p.getRead().getReadGroup(); - if ( readGroup == null ) - continue; - - if ( byReadGroup ) { - String readGroupName = readGroup.getReadGroupId(); - long oldDepth = info.depthByReadGroup.get(readGroupName); - info.depthByReadGroup.put(readGroupName, oldDepth + 1); - } - - if ( bySample ) { - String sample = readGroup.getSample(); - if ( 
sample != null ) { - long oldDepth = info.depthBySample.get(sample); - info.depthBySample.put(sample, oldDepth + 1); - } - } - } - - info.numDeletions = nDeletionReads; - if ( excludeMAPQBelowThis > 0 ) - info.numBadMQReads = nBadMAPQReads; - - // if we need to print the histogram, fill in the data - if ( printHistogram ) - incCov(info.totalCoverage); - - if ( !suppressLocusPrinting ) - printDoCInfo(context.getLocation(), info, false); - - return info; - } - - public boolean isReduceByInterval() { - return true; - } - - public DoCInfo reduceInit() { return new DoCInfo(); } - - public DoCInfo reduce(DoCInfo value, DoCInfo sum) { - - // combine all of the per-locus data for a given interval - - sum.totalCoverage += value.totalCoverage; - sum.numDeletions += value.numDeletions; - sum.numBadMQReads += value.numBadMQReads; - if ( value.totalCoverage >= minDepthForPercentage ) { - sum.minDepthCoveredLoci++; - } - if ( printBaseCounts ) { - for (int baseIndex = 0; baseIndex < BaseUtils.BASES.length; baseIndex++ ) - sum.baseCounts[baseIndex] += value.baseCounts[baseIndex]; - } - if ( byReadGroup ) { - for ( String rg : readGroupNames ) { - long oldDepth = sum.depthByReadGroup.get(rg); - sum.depthByReadGroup.put(rg, oldDepth + value.depthByReadGroup.get(rg)); - } - } - if ( bySample ) { - for ( String sample : sampleNames ) { - long oldDepth = sum.depthBySample.get(sample); - sum.depthBySample.put(sample, oldDepth + value.depthBySample.get(sample)); - } - } - - return sum; - } - - @Override - public void onTraversalDone(List> results) { - - // build and print the per-interval header - if ( ! 
suppressIntervalPrinting ) { - out.println("\n\nPER_INTERVAL_COVERAGE_SECTION"); - printHeaderLine(true); - - // print all of the individual per-interval coverage data - for ( Pair result : results ) - printDoCInfo(result.first, result.second, true); - } - - // if we need to print the histogram, do so now - if ( printHistogram ) - printHisto(); - } - - private void printHeaderLine(boolean printAverageCoverage) { - StringBuilder header = new StringBuilder("location\ttotal_coverage"); - if ( printAverageCoverage ) - header.append("\taverage_coverage"); - header.append("\tcoverage_without_deletions"); - if ( printAverageCoverage ) - header.append("\taverage_coverage_without_deletions"); - if ( excludeMAPQBelowThis > 0 ) { - header.append("\tcoverage_atleast_MQ"); - header.append(excludeMAPQBelowThis); - if ( printAverageCoverage ) { - header.append("\taverage_coverage_atleast_MQ"); - header.append(excludeMAPQBelowThis); - } - } - if ( printAverageCoverage && minDepthForPercentage >= 0 ) { - header.append("\tpercent_loci_covered_atleast_depth"); - header.append(minDepthForPercentage); - } - if ( printBaseCounts ) { - header.append("\tA_count\tC_count\tG_count\tT_count"); - } - if ( byReadGroup ) { - for ( String rg : readGroupNames ) { - header.append("\tcoverage_for_"); - header.append(rg); - } - } - if ( bySample ) { - for ( String sample : sampleNames ) { - header.append("\tcoverage_for_"); - header.append(sample); - } - } - out.println(header.toString()); - } - - private void incCov(long depth) { - long c = coverageHist.expandingGet((int)depth, 0L); - coverageHist.set((int)depth, c + 1); - if ( depth > maxDepth ) - maxDepth = depth; - totalLoci++; - } - - private long getCov(long depth) { - return coverageHist.get((int)depth); - } - - private void printHisto() { - - // sanity check - if ( totalLoci == 0 ) - return; - - // Code for calculting std devs adapted from Michael Melgar's python script - - // Find the maximum extent of 'good' data - // First, find the mode 
- long maxValue = getCov(1); // ignore doc=0 - int mode = 1; - for (int i = 2; i <= maxDepth; i++) { - if ( getCov(i) > maxValue ) { - maxValue = getCov(i); - mode = i; - } - } - - // now, procede to find end of good Gaussian fit - long dist = (long)Math.pow(10, 9); - while ( Math.abs(getCov(mode) - getCov(1)) < dist && mode < maxDepth ) - dist = Math.abs(getCov(mode++) - getCov(1)); - long maxGoodDepth = Math.min(mode + 1, maxDepth); - - // calculate the mean of the good region - long totalGoodSites = 0, totalGoodDepth = 0; - for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 - totalGoodSites += getCov(i); - totalGoodDepth += i * getCov(i); - } - double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites; - - // calculate the variance and standard deviation of the good region - double var = 0.0; - for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 - var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2); - } - double stdev = Math.sqrt(var / (double)totalGoodSites); - - // print - out.println("\n\nHISTOGRAM_SECTION"); - out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev); - - for (int i = 1; i <= 5; i++) - out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev)); - - out.println("\ndepth count freq(percent)"); - for (int i = 0; i <= maxDepth; i++) - out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)totalLoci); - } - - private void printDoCInfo(GenomeLoc loc, DoCInfo info, boolean printAverageCoverage) { - - double totalBases = (double)(loc.getStop() - loc.getStart() + 1); - - StringBuilder sb = new StringBuilder(); - sb.append(loc); - sb.append("\t"); - sb.append(info.totalCoverage); - sb.append("\t"); - if ( printAverageCoverage ) { - sb.append(String.format("%.2f", ((double)info.totalCoverage) / totalBases)); - sb.append("\t"); - } - sb.append((info.totalCoverage - info.numDeletions)); - - if ( printAverageCoverage ) { - 
sb.append("\t"); - sb.append(String.format("%.2f", ((double)(info.totalCoverage - info.numDeletions)) / totalBases)); - } - - if ( excludeMAPQBelowThis > 0 ) { - sb.append("\t"); - sb.append((info.totalCoverage - info.numBadMQReads)); - if ( printAverageCoverage ) { - sb.append("\t"); - sb.append(String.format("%.2f", ((double)(info.totalCoverage - info.numBadMQReads)) / totalBases)); - } - } - - if ( printAverageCoverage && minDepthForPercentage >= 0 ) { - sb.append("\t"); - sb.append(String.format("%.2f", ((double)info.minDepthCoveredLoci) / totalBases)); - } - - if ( printBaseCounts ) { - for (int baseIndex = 0; baseIndex < BaseUtils.BASES.length; baseIndex++ ) { - sb.append("\t"); - sb.append(String.format("%8d", info.baseCounts[baseIndex])); - } - } - - if ( byReadGroup ) { - for ( String rg : readGroupNames ) { - sb.append("\t"); - sb.append(String.format("%8d", info.depthByReadGroup.get(rg))); - } - } - - if ( bySample ) { - for ( String sample : sampleNames ) { - sb.append("\t"); - sb.append(String.format("%8d", info.depthBySample.get(sample))); - } - } - - out.println(sb.toString()); - } - - public class DoCInfo { - public long totalCoverage = 0; - public long numDeletions = 0; - public long numBadMQReads = 0; - public long minDepthCoveredLoci = 0; - - public long[] baseCounts = null; - - public HashMap depthByReadGroup = null; - public HashMap depthBySample = null; - - public DoCInfo() { - if ( printBaseCounts ) { - baseCounts = new long[4]; - } - if ( byReadGroup ) { - depthByReadGroup = new HashMap(); - for ( String readGroupName : readGroupNames ) - depthByReadGroup.put(readGroupName, 0L); - } - if ( bySample ) { - depthBySample = new HashMap(); - for ( String sample : sampleNames ) - depthBySample.put(sample, 0L); - } - } - - @Override - public String toString() { - // This is an executive summary, included mainly so that integration tests will pass. - // TODO: Add a more compelling summary. 
- return String.format("Summary: total coverage = %d; # of deletions = %d; # of bad mapping quality reads = %d; minimum covered depth =%d", - totalCoverage, - numDeletions, - numBadMQReads, - minDepthCoveredLoci); - } - - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/AlignmentUtils.java b/archive/java/src/org/broadinstitute/sting/oldindels/AlignmentUtils.java deleted file mode 100644 index d8d263b9e..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/AlignmentUtils.java +++ /dev/null @@ -1,412 +0,0 @@ -package org.broadinstitute.sting.utils; - -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.picard.reference.ReferenceSequence; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.pileup.*; - - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Mar 25, 2009 - * Time: 12:15:38 AM - * To change this template use File | Settings | File Templates. - */ -public class AlignmentUtils { - - - /** Returns number of mismatches in the alignment r to the reference sequence - * refSeq. It is assumed that - * the alignment starts at (1-based) position r.getAlignmentStart() on the specified, and all single-base mismatches - * are counted in the alignment segments where both sequences are present. Insertions/deletions are skipped and do - * not contribute to the error count returned by this method. 
- * @param r aligned read - * @param refSeq reference sequence - * @return number of single-base mismatches in the aligned segments (gaps on either of the sequences are skipped) - */ - public static int numMismatches(SAMRecord r, ReferenceSequence refSeq) { - byte[] ref = refSeq.getBases(); - if ( r.getReadUnmappedFlag() ) return 1000000; - int i_ref = r.getAlignmentStart()-1; // position on the ref - int i_read = 0; // position on the read - int mm_count = 0; // number of mismatches - Cigar c = r.getCigar(); - for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { - CigarElement ce = c.getCigarElement(k); - switch( ce.getOperator() ) { - case M: - for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { - char refChr = (char)ref[i_ref]; - char readChr = (char)r.getReadBases()[i_read]; - if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || - BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) - continue; // do not count Ns/Xs/etc ? - if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) - mm_count++; - } - break; - case I: - case S: - i_read += ce.getLength(); - break; - case D: - case N: - i_ref += ce.getLength(); - break; - default: throw new RuntimeException("Unrecognized cigar element"); - } - - } - return mm_count; - } - - /** - * mhanna - 11 May 2009 - stubbed out competing method that works with partial references. - * Computes number of mismatches in the read alignment to the refence ref - * specified in the record r. Indels are completely ignored by this method: - * only base mismatches in the alignment segments where both sequences are present are counted. 
- * @param r - * @return - */ - public static int numMismatches(SAMRecord r, char[] ref) { - if ( r.getReadUnmappedFlag() ) return 1000000; - int i_ref = 0; // position on the ref - int i_read = 0; // position on the read - int mm_count = 0; // number of mismatches - Cigar c = r.getCigar(); - for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { - CigarElement ce = c.getCigarElement(k); - switch( ce.getOperator() ) { - case M: - for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { - char refChr = ref[i_ref]; - char readChr = (char)r.getReadBases()[i_read]; - if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || - BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) - continue; // do not count Ns/Xs/etc ? - if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) - mm_count++; - } - break; - case I: - case S: - i_read += ce.getLength(); - break; - case D: - case N: - i_ref += ce.getLength(); - break; - default: throw new RuntimeException("Unrecognized cigar element: " + ce.getOperator()); - } - - } - return mm_count; - } - - // IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED - // TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO - // PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY... - /** See {@link #numMismatches(SAMRecord, ReferenceSequence)}. This method implements same functionality - * for reference sequence specified as conventional java string (of bases). By default, it is assumed that - * the alignment starts at (1-based) position r.getAlignmentStart() on the reference refSeq. - * See {@link #numMismatches(SAMRecord, String, int)} if this is not the case. 
-     */
-    public static int numMismatches(SAMRecord r, String refSeq ) {
-        if ( r.getReadUnmappedFlag() ) {
-            return 1000000; // sentinel for unmapped reads
-        }
-        // getAlignmentStart() is 1-based, the three-argument overload wants a 0-based index
-        final int zeroBasedStart = r.getAlignmentStart() - 1;
-        return numMismatches(r, refSeq, zeroBasedStart);
-    }
-
-    /** Returns number of mismatches in the alignment r to the reference sequence
-     * refSeq assuming the alignment starts at (ZERO-based) position refIndex on the
-     * specified reference sequence; in other words, refIndex is used in place of alignment's own
-     * getAlignmentStart() coordinate and the latter is never used. However, the structure of the alignment r
-     * (i.e. its cigar string with all the insertions/deletions it may specify) is fully respected.
-     *
-     * @param r alignment
-     * @param refSeq chunk of reference sequence the alignment is compared against; aligned positions that
-     * fall at or beyond the end of this string are skipped (not counted, no exception).
-     * @param refIndex zero-based position, at which the alignment starts on the specified reference string.
-     * @return the number of mismatches
-     */
-    public static int numMismatches(SAMRecord r, String refSeq, int refIndex) {
-        int readPos = 0;
-        int nMismatches = 0;
-        final byte[] readBases = r.getReadBases();
-        final Cigar cigar = r.getCigar();
-
-        for ( int i = 0 ; i < cigar.numCigarElements() ; i++ ) {
-            final CigarElement element = cigar.getCigarElement(i);
-            switch ( element.getOperator() ) {
-                case M:
-                    for ( int done = 0 ; done < element.getLength() ; done++, refIndex++, readPos++ ) {
-                        if ( refIndex < refSeq.length() ) {
-                            final char refChr = refSeq.charAt(refIndex);
-                            final char readChr = (char) readBases[readPos];
-                            // Note: we need to count X/N's as mismatches because that's what SAM requires,
-                            // so, unlike the char[] overload, there is no simple-base filter here
-                            if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) {
-                                nMismatches++;
-                            }
-                        }
-                    }
-                    break;
-                case I:
-                case S:
-                    // element consumes read bases only
-                    readPos += element.getLength();
-                    break;
-                case D:
-                case N:
-                    // element consumes reference bases only
-                    refIndex += element.getLength();
-                    break;
-                default:
-                    throw new StingException("The " + element.getOperator() + " cigar element is not currently supported");
-            }
-        }
-        return nMismatches;
-    }
-
-    /** Returns the number of mismatches in the pileup within the given reference context.
-     *
-     * @param pileup the pileup with reads
-     * @param ref the reference context
-     * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
-     * @return the number of mismatches
-     */
-    public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) {
-        // sum of the per-element counts
-        int total = 0;
-        for ( PileupElement element : pileup ) {
-            total += mismatchesInRefWindow(element, ref, ignoreTargetSite);
-        }
-        return total;
-    }
-
-    /** Returns the number of mismatches in the pileup element within the given reference context.
-     *
-     * @param p the pileup element
-     * @param ref the reference context
-     * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e.
the center of the window)
-     * @return the number of mismatches
-     */
-    public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) {
-
-        int mismatches = 0;
-
-        // genomic bounds of the reference window; refBases is indexed relative to windowStart
-        int windowStart = (int)ref.getWindow().getStart();
-        int windowStop = (int)ref.getWindow().getStop();
-        char[] refBases = ref.getBases();
-        byte[] readBases = p.getRead().getReadBases();
-        Cigar c = p.getRead().getCigar();
-
-        int readIndex = 0;                                     // offset into readBases
-        int currentPos = p.getRead().getAlignmentStart();      // genomic position being examined
-        int refIndex = Math.max(0, currentPos - windowStart);  // offset into refBases (0 if the read starts before the window)
-
-        for (int i = 0 ; i < c.numCigarElements() ; i++) {
-            CigarElement ce = c.getCigarElement(i);
-            int cigarElementLength = ce.getLength();
-            switch ( ce.getOperator() ) {
-                case M:
-                    for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) {
-                        // are we past the ref window?
-                        // NOTE(review): this break leaves only the inner loop; the outer loop still visits
-                        // the remaining cigar elements
-                        if ( currentPos > windowStop )
-                            break;
-
-                        // are we before the ref window?
-                        if ( currentPos < windowStart )
-                            continue;
-
-                        // refIndex advances for every in-window aligned position, including a skipped target site
-                        char refChr = refBases[refIndex++];
-
-                        // do we need to skip the target site?
-                        if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos )
-                            continue;
-
-                        char readChr = (char)readBases[readIndex];
-                        if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
-                            mismatches++;
-                    }
-                    break;
-                case I:
-                case S:
-                    // read-only bases: advance on the read, not on the reference
-                    readIndex += cigarElementLength;
-                    break;
-                case D:
-                case N:
-                    // reference-only bases: advance the genomic position, and advance refIndex by the
-                    // smaller of the element length and (new position - windowStart)
-                    currentPos += cigarElementLength;
-                    if ( currentPos > windowStart )
-                        refIndex += Math.min(cigarElementLength, currentPos - windowStart);
-                    break;
-                default:
-                    // fail silently
-                    // (any other operator discards the count accumulated so far and reports 0)
-                    return 0;
-            }
-
-        }
-
-        return mismatches;
-    }
-
-    /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
-     * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
-     * it only counts blocks without actually allocating and filling the list of blocks themselves.
Hence, this method is
-     * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed.
-     * Formally, this method simply returns the number of M elements in the cigar.
-     * @param r alignment
-     * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored).
-     */
-    public static int getNumAlignmentBlocks(final SAMRecord r) {
-        int n = 0;
-        final Cigar cigar = r.getCigar();
-        if (cigar == null) return 0;
-
-        for (final CigarElement e : cigar.getCigarElements()) {
-            if (e.getOperator() == CigarOperator.M ) n++;
-        }
-
-        return n;
-    }
-
-    /** Renders a cigar as text: each element becomes its length followed by an operator character.
-     * Only M, D and I have their own character; every other operator is printed as '?'.
-     *
-     * @param cig cigar to render; must not be null (see {@link #cigarToString(Cigar)} for a null-tolerant version)
-     * @return the textual form of the cigar
-     */
-    public static String toString(Cigar cig) {
-        StringBuilder b = new StringBuilder();
-
-        for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) {
-            char c='?';
-            switch ( cig.getCigarElement(i).getOperator() ) {
-            case M : c = 'M'; break;
-            case D : c = 'D'; break;
-            case I : c = 'I'; break;
-            }
-            b.append(cig.getCigarElement(i).getLength());
-            b.append(c);
-        }
-        return b.toString();
-    }
-
-
-    /** Same as {@link #alignmentToString(Cigar, String, String, int, int)} with posOnRead = 0. */
-    public static String alignmentToString(final Cigar cigar,final String seq, final String ref, final int posOnRef ) {
-        return alignmentToString( cigar, seq, ref, posOnRef, 0 );
-    }
-
-    /** Null-tolerant variant of {@link #toString(Cigar)}.
-     *
-     * @param cig cigar to render, may be null
-     * @return the string "null" for a null cigar, otherwise the same text as {@link #toString(Cigar)}
-     */
-    public static String cigarToString(Cigar cig) {
-        if ( cig == null )
-            return "null";
-
-        // the rendering loop used to be duplicated here verbatim; delegate instead
-        return toString(cig);
-    }
-
-    /** Builds a two-line picture of an alignment: the reference line, a newline, the read line, a newline.
-     * The first posOnRead read bases are printed unaligned against the reference bases immediately preceding
-     * posOnRef. After that the cigar is followed: inserted bases show '+' on the reference line, deleted
-     * bases show '*' on the read line.
-     *
-     * @param cigar alignment structure; only I, D and M elements are supported
-     * @param seq read sequence
-     * @param ref reference sequence
-     * @param posOnRef index into ref at which the cigar starts
-     * @param posOnRead index into seq at which the cigar starts
-     * @return the two-line representation
-     * @throws StingException if the cigar contains an operator other than I, D or M
-     */
-    public static String alignmentToString(final Cigar cigar,final String seq, final String ref, final int posOnRef, final int posOnRead ) {
-        int readPos = posOnRead;
-        int refPos = posOnRef;
-
-        StringBuilder refLine = new StringBuilder();
-        StringBuilder readLine = new StringBuilder();
-
-        // leading read bases that precede the aligned part
-        for ( int i = 0 ; i < posOnRead ; i++ ) {
-            refLine.append( ref.charAt( refPos - readPos + i ) );
-            readLine.append( seq.charAt(i) ) ;
-        }
-
-        for ( int i = 0 ; i < cigar.numCigarElements() ; i++ ) {
-
-            final CigarElement ce = cigar.getCigarElement(i);
-
-            switch(ce.getOperator()) {
-            case I:
-                for ( int j = 0 ; j < ce.getLength(); j++ ) {
-                    refLine.append('+');
-                    readLine.append( seq.charAt( readPos++ ) );
-                }
-                break;
-            case D:
-                for ( int j = 0 ; j < ce.getLength(); j++ ) {
-                    readLine.append('*');
-                    refLine.append( ref.charAt( refPos++ ) );
-                }
-                break;
-            case M:
-                for ( int j = 0 ; j < ce.getLength(); j++ ) {
-                    refLine.append(ref.charAt( refPos++ ) );
-                    readLine.append( seq.charAt( readPos++ ) );
-                }
-                break;
-            default: throw new StingException("Unsupported cigar operator: "+ce.getOperator() );
-            }
-        }
-        refLine.append('\n');
-        refLine.append(readLine);
-        refLine.append('\n');
-        return refLine.toString();
-    }
-
-    /** Projects the reference onto the read: returns an array of the read's length in which every
-     * M position holds the reference base aligned to it and every inserted or soft-clipped read
-     * position holds '+'.
-     *
-     * @param cigar alignment structure; I, S, D, N and M elements are supported
-     * @param read read bases (only the length is used)
-     * @param ref reference bases, indexed from the start of the alignment
-     * @return array of read.length characters as described above
-     * @throws StingException if the cigar contains any other operator
-     */
-    public static char[] alignmentToCharArray( final Cigar cigar, final char[] read, final char[] ref ) {
-
-        final char[] alignment = new char[read.length];
-        int refPos = 0;
-        int alignPos = 0;
-
-        for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
-
-            final CigarElement ce = cigar.getCigarElement(iii);
-
-            switch( ce.getOperator() ) {
-            case I:
-            case S:
-                for ( int jjj = 0 ; jjj < ce.getLength(); jjj++ ) {
-                    alignment[alignPos++] = '+';
-                }
-                break;
-            case D:
-            case N:
-                // a deletion/skip consumes its whole length on the reference; advancing by one
-                // (as this code used to) misaligns every following M block for elements longer than 1
-                refPos += ce.getLength();
-                break;
-            case M:
-                for ( int jjj = 0 ; jjj < ce.getLength(); jjj++ ) {
-                    alignment[alignPos] = ref[refPos];
-                    alignPos++;
-                    refPos++;
-                }
-                break;
-            default:
-                throw new StingException( "Unsupported cigar operator: " + ce.getOperator() );
-            }
-        }
-        return alignment;
-    }
-
-    /**
-     * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
-     * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
-     * alignment reference index/start.
-     * @param r
-     * @return
-     */
-    public static boolean isReadUnmapped(final SAMRecord r) {
-        if ( r.getReadUnmappedFlag() ) return true;
-
-        // our life would be so much easier if all sam files followed the specs. In reality,
-        // sam files (including those generated by maq or bwa) miss headers altogether. When
-        // reading such a SAM file, reference name is set, but since there is no sequence dictionary,
-        // null is always returned for referenceIndex. Let's be paranoid here, and make sure that
-        // we do not call the read "unmapped" when it has only reference name set with ref. index missing
-        // or vice versa.
-
-        // the name is compared by value: the previous identity comparison (!=) only worked when the
-        // record happened to hold the very same String instance as the constant
-        final boolean hasReferenceIndex = r.getReferenceIndex() != null
-                && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
-        final boolean hasReferenceName = r.getReferenceName() != null
-                && ! SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(r.getReferenceName());
-
-        // mapped = some reference is set (by index or by name) AND a start position is set
-        if ( ( hasReferenceIndex || hasReferenceName )
-                && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ;
-        return true;
-    }
-}
diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/ConsensusSequence.java b/archive/java/src/org/broadinstitute/sting/oldindels/ConsensusSequence.java
deleted file mode 100644
index 518d4bd71..000000000
--- a/archive/java/src/org/broadinstitute/sting/oldindels/ConsensusSequence.java
+++ /dev/null
@@ -1,211 +0,0 @@
-package org.broadinstitute.sting.playground.indels;
-
-import org.broadinstitute.sting.utils.Pair;
-
-import java.util.List;
-import java.util.ArrayList;
-
-/**
- * Created by IntelliJ IDEA.
- * User: asivache
- * Date: Mar 21, 2009
- * Time: 4:25:12 PM
- * To change this template use File | Settings | File Templates.
- */
-public class ConsensusSequence {
-    private List< int[] > coverage; // counts of observations of every nucleotide at each position
-    private long referencePos;// (arbitrary) reference position; when adding sequences, their offsets should be wrt this position
-    private int startOffset; // offset of the leftmost base of this consensus sequence wrt referencePos; negative=left, positive=right
-    private final static int NUMBINS = 4;
-    private final static char[] BASES = { 'A','C','G','T' };
-
-    public ConsensusSequence(int refPos) {
-        coverage = new ArrayList< int[] >();
-        referencePos = refPos;
-        startOffset = 0;
-    }
-
-    public ConsensusSequence() {
-        this(0);
-    }
-
-    /** Adds sequence seq to the consensus, in the sense that all bases of seq are counted
-     * into observed coverage kept by the consensus. The position of the sequence is specified by the
-     * offset with respect to the fixed reference position of the consensus (the latter does not
-     * have to be consensus start), and if the sequence extends beyond the consensus on either end, the
-     * consensus will be extended appropriately to accommodate the full sequence.
-     * @param seq nucleotide sequence ('ACGT...'); 'N' is skipped, any other symbol is rejected
-     * @param offset position of the start of the sequence relative to the fixed reference position of the consensus
-     * @throws IllegalArgumentException if seq contains a symbol other than A, C, G, T or N (case-insensitive)
-     */
-    public void addSequence(String seq, int offset) {
-        // if sequence starts before the currently held consensus does, extend consensus to the left
-        if ( offset < startOffset ) {
-            coverage.addAll(0,instantiateCoverageList(startOffset-offset));
-            startOffset = offset;
-        }
-        // if the sequence ends beyond the currently held consensus, extend consensus to the right
-        if ( offset + seq.length() > startOffset + coverage.size() ) {
-            coverage.addAll( instantiateCoverageList(offset+seq.length() - startOffset - coverage.size()) );
-        }
-
-        // count bases from the sequence into the coverage
-        int posOnConsensus = offset - startOffset;
-        for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
-            char base = Character.toUpperCase(seq.charAt(i));
-            if ( base == 'N') continue;
-            coverage.get(posOnConsensus)[baseToInt(base)]++;
-        }
-    }
-
-    /** Removes sequence seq from the consensus. More exactly, 1 will be subtracted from current
-     * observation counts kept by the consensus for each observed base at every position of the sequence. The
-     * position of the sequence is specified by the offset with respect to the reference position
-     * of the consensus. NOTE: this method is unchecked and does not verify that the sequence being subtracted
-     * was indeed previously added to the consensus and/or that the consensus does accommodate full length of
-     * the sequence. If it is not the case, the results can be unpredictable or assert failure may occur.
-     *
-     * @param seq nucleotide sequence ('ACGT...')
-     * @param offset position of the start of the sequence relative to the fixed reference position of the consensus
-     */
-    public void removeSequence(String seq, int offset) {
-        assert offset >= startOffset :
-                "Attempt to remove from consensus a sequence that starts prior to consenus start";
-        // '<=': a sequence ending exactly at the last consensus base is legal (addSequence produces
-        // exactly that layout); the former strict '<' rejected it
-        assert (offset+seq.length() <= startOffset + coverage.size()) :
-                "Attempt to remove from consensus a sequence that extends beyond consensus end";
-        // subtract sequence bases from the coverage
-        int posOnConsensus = offset - startOffset;
-        for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
-            char base = Character.toUpperCase(seq.charAt(i));
-            if ( base == 'N') continue;
-            coverage.get(posOnConsensus)[ baseToInt(base) ]--;
-        }
-    }
-
-    /** Returns offset of the start of consensus sequence with respect to the reference position the
-     * consensus is pinned to.
-     * @return offset of the leftmost consensus base
-     */
-    public int getStartOffset() { return startOffset; }
-
-    /** Returns the length (number of bases) of the consensus sequence.
-     *
-     * @return number of positions currently held
-     */
-    public int length() { return coverage.size(); }
-
-    /** Returns the "distance" (score measuring the agreement) from the currently held consensus sequence to
-     * the specified sequence seq starting at position offset wrt consensus reference position.
-     * NOTE: the scoring itself was never implemented. The method only walks the part of seq that overlaps
-     * the consensus (validating its symbols) and then returns 0.0.
-     * @param seq nucleotide sequence
-     * @param offset position of the start of seq relative to the reference position of the consensus
-     * @return always 0.0 (TODO: implement the actual score)
-     * @throws IllegalArgumentException if the overlapping part of seq contains a symbol other than ACGTN
-     */
-    public double distance(String seq, int offset) {
-        int i ; // index into the passed sequence argument
-        if ( offset < startOffset ) {
-            i = startOffset - offset; // skip the part of seq that hangs off the left end of the consensus
-        } else {
-            i = 0 ;
-        }
-        // stop position on the passed sequence (can be less than sequence length if consensus stops prematurely)
-        int stop = Math.min(offset+seq.length(), startOffset+coverage.size() ) - offset;
-
-        for ( ; i < stop ; i++ ) {
-            // seq must be indexed with i; this used to index it with the consensus position, which
-            // reads the wrong base and can run past the end of seq
-            baseToInt(Character.toUpperCase(seq.charAt(i)));
-        }
-        return 0.0;
-    }
-
-    /** Returns consensus base at the specified offset wrt the consensus sequence's reference position.
-     * Specified offset must be within the span of currently held consensus sequence. Consensus base is the
-     * one with the maximum count of observations. If two different nucleotides were observed exactly the
-     * same number of times (and that number is greater than the number of observations for other nucleotides),
-     * the "lesser" one, (order being ACGT) will be returned. If coverage at specified position is zero, 'N' will
-     * be returned.
-     * @param offset position relative to the reference position of the consensus
-     * @return consensus base, or 'N'
-     */
-    public char baseAt(int offset) {
-        assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds";
-        int best = consensusIndex(coverage.get(offset-startOffset));
-        return best < 0 ? 'N' : BASES[best];
-    }
-
-    /** Returns consensus base at the specified offset together with its observation count.
-     *
-     * @param offset position relative to the reference position of the consensus
-     * @return pair of (consensus base, its count); ('N', 0) when no base has a positive count
-     * @see #baseAt(int)
-     */
-    public Pair<Character,Integer> baseWithCountAt(int offset) {
-        assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds";
-        int [] cov = coverage.get(offset-startOffset);
-        int best = consensusIndex(cov);
-        if ( best < 0 ) return new Pair<Character,Integer>('N', 0);
-        return new Pair<Character,Integer>(BASES[best], cov[best]);
-    }
-
-    /** Returns total coverage (all observations regardless of what base was observed) at position
-     * specified by offset with respect to the consensus' reference position. offset does not have to be within
-     * the bounds of the currently kept consensus sequence, if it falls outside, a 0 will be silently returned.
-     * @param offset position relative to the reference position of the consensus
-     * @return sum of the four base counts at that position, or 0 outside the consensus
-     */
-    public int coverageAt(int offset) {
-        if ( offset < startOffset || offset >= startOffset + coverage.size() ) return 0;
-        int [] cov = coverage.get(offset-startOffset);
-        return cov[0]+cov[1]+cov[2]+cov[3];
-    }
-
-    /** Returns consensus sequence as a string of bases (ACGTN); N will be returned for positions with zero
-     * coverage.
-     * @return the consensus sequence
-     */
-    public String getSequence() {
-        char [] b = new char[coverage.size()];
-        for ( int i = 0 ; i < b.length ; i++ ) {
-            b[i] = baseAt(i+startOffset);
-        }
-        return new String(b);
-    }
-
-    /** Index (into BASES) of the first base with the largest strictly positive count, or -1 if there is none. */
-    private static int consensusIndex(int[] cov) {
-        int best = -1;
-        int bmax = 0;
-        for ( int z = 0; z < NUMBINS ; z++ ) {
-            if ( cov[z] > bmax ) {
-                bmax = cov[z];
-                best = z;
-            }
-        }
-        return best;
-    }
-
-    /** Creates n fresh, zeroed per-position count arrays. */
-    private List< int[] > instantiateCoverageList(int n) {
-        List< int[] > subseq = new ArrayList< int[] >(n);
-        for ( int i = 0 ; i < n ; i++ ) subseq.add(new int[NUMBINS]);
-        return subseq;
-    }
-
-    /** Maps a base symbol (case-insensitive) to its bin: A=0, C=1, G=2, T=3, N=-1. */
-    private int baseToInt(char c) {
-        int base;
-        switch( Character.toUpperCase(c) ) {
-        case 'A': base = 0; break;
-        case 'C': base = 1; break;
-        case 'G': base = 2; break;
-        case 'T': base = 3; break;
-        case 'N': base = -1; break;
-        default : throw new IllegalArgumentException("Sequence can contain only ACGTN symbols");
-        }
-        return base;
-    }
-}
diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/CountedObject.java b/archive/java/src/org/broadinstitute/sting/oldindels/CountedObject.java
deleted file mode 100755
index d2f7ffc5f..000000000
--- a/archive/java/src/org/broadinstitute/sting/oldindels/CountedObject.java
+++ /dev/null
@@ -1,59 +0,0 @@
-package org.broadinstitute.sting.utils;
-
-/** Utility class that makes working with counted objects slightly easier (and faster).
- * Consider a "generic" counter representation as Map: updating the counter would require
- * int i = map.get(obj).intValue(); i++; map.set(obj,i) - cumbersome, and also inefficient due to extra invocations of Integer
- * constructor. This ObjectCounter class can increment its internally kept counter without the need to rebuild any objects,
- * so one can use, e.g. "Set myset;" and then "myset.get(obj).increment()". Note that equals() method
- * defines counted objects to be the same iff the underlying objects are equal, regardless of the
- * counter value. Should the counters be compared, one has to use the getters on the two counted objects
- * and compare the results.
- * @author asivache
- *
- */
-public class CountedObject<T> {
-    private T mObject;
-    int mCounter;
-
-    /** Creates new counter associated with the passed object and assigns the default count of 1
-     *
-     * @param o object to start counting for; must not be null
-     * @throws StingException if o is null
-     */
-    public CountedObject(T o) {
-        if ( o==null ) throw new StingException("BUG: Can not wrap null as a counted object");
-        mObject = o;
-        mCounter = 1;
-    }
-
-    /** Creates new counter associated with the object o and assigns specified initial count to it
-     *
-     * @param o object to start counting for; must not be null
-     * @param n initial count
-     * @throws StingException if o is null
-     */
-    public CountedObject(T o, int n) {
-        // same null check as the one-argument constructor; this used to be only an assert, which is a
-        // no-op without -ea and let a null through to fail later inside equals()/hashCode()
-        this(o);
-        mCounter = n;
-    }
-
-    public T getObject() { return mObject; }
-    public int getCount() { return mCounter; }
-    public void increment() { mCounter++;}
-    public void increment(int n) { mCounter+=n; }
-    public void decrement() { mCounter--; }
-    public void decrement(int n) { mCounter -= n; }
-
-    /** Two counted objects are equal iff their wrapped objects have the same runtime class and are
-     * equal to each other; the counter value is ignored.
-     */
-    @Override
-    public boolean equals(Object o) {
-        if ( this == o ) return true;
-        if ( ! ( o instanceof CountedObject ) ) return false;
-        CountedObject<?> other = (CountedObject<?>)o;
-        if ( other.mObject.getClass() != this.mObject.getClass() ) return false;
-        return mObject.equals(other.getObject());
-    }
-
-    /** Hash of the wrapped object only, consistent with equals(). */
-    @Override
-    public int hashCode() {
-        return mObject.hashCode();
-    }
-
-}
diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/CountedObjectComparatorAdapter.java b/archive/java/src/org/broadinstitute/sting/oldindels/CountedObjectComparatorAdapter.java
deleted file mode 100755
index 9d69da144..000000000
--- a/archive/java/src/org/broadinstitute/sting/oldindels/CountedObjectComparatorAdapter.java
+++ /dev/null
@@ -1,42 +0,0 @@
-package org.broadinstitute.sting.playground.utils;
-
-
-/** Support class for counted objects.
This comparator is an adapter: it is initialized with an arbitrary
- * comparator for objects of type T and can be used to directly compare counted objects of type CountedObject
- * (the underlying Comparator will be used to compare the "object" part of the counted objects, the counter values
- * will be ignored). This comparator also provides additional, non-standard methods that allow direct
- * comparison between a CountedObject and "raw" object of type T (the same underlying Comparator will be used,
- * and the value of the counter in the counted object will be ignored).
- * @param <T> type of the objects wrapped by the counted objects being compared
- */
-public class CountedObjectComparatorAdapter<T> implements java.util.Comparator<CountedObject<T>> {
-
-    private java.util.Comparator<T> mComp;
-
-    /** Initializes comparator adapter with a comparator for objects of type T */
-    public CountedObjectComparatorAdapter(java.util.Comparator<T> adaptee) {
-        mComp = adaptee;
-    }
-
-    /** Compares the wrapped objects of two counted objects; counters are ignored. */
-    @Override
-    public int compare(CountedObject<T> o1, CountedObject<T> o2) {
-        return mComp.compare(o1.getObject(),o2.getObject());
-    }
-
-    /** Compares a raw object against the wrapped object of a counted object. */
-    public int compare(T o1, CountedObject<T> o2) {
-        return mComp.compare(o1,o2.getObject());
-    }
-
-    /** Compares the wrapped object of a counted object against a raw object. */
-    public int compare(CountedObject<T> o1, T o2) {
-        return mComp.compare(o1.getObject(),o2);
-    }
-
-    /** Two adapters are equal when their underlying comparators are of the same class. */
-    @Override
-    public boolean equals(Object o) {
-        if ( o instanceof CountedObjectComparatorAdapter) {
-            if ( ((CountedObjectComparatorAdapter<?>) o).mComp.getClass() == mComp.getClass() ) return true;
-        }
-        return false;
-    }
-
-    /** Kept consistent with equals(): derived from the class of the underlying comparator only. */
-    @Override
-    public int hashCode() {
-        return mComp.getClass().hashCode();
-    }
-
-}
diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/DiscardingPileReceiver.java b/archive/java/src/org/broadinstitute/sting/oldindels/DiscardingPileReceiver.java
deleted file mode 100644
index 6555110f0..000000000
--- a/archive/java/src/org/broadinstitute/sting/oldindels/DiscardingPileReceiver.java
+++ /dev/null
@@ -1,19 +0,0 @@
-package org.broadinstitute.sting.playground.indels;
-
-import net.sf.samtools.SAMRecord;
-
-import java.util.Collection;
-
-/**
- * Created by IntelliJ IDEA.
- * User: asivache
- * Date: Mar 20, 2009
- * Time: 12:55:01 AM
- * A pile receiver that drops every pile handed to it.
- */
-public class DiscardingPileReceiver implements RecordPileReceiver {
-    @Override
-    public void receive(Collection c) {
-        // intentionally empty: the pile is discarded
-    }
-}
diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/DiscardingReceiver.java b/archive/java/src/org/broadinstitute/sting/oldindels/DiscardingReceiver.java
deleted file mode 100644
index fa7d219f4..000000000
--- a/archive/java/src/org/broadinstitute/sting/oldindels/DiscardingReceiver.java
+++ /dev/null
@@ -1,17 +0,0 @@
-package org.broadinstitute.sting.playground.indels;
-
-import net.sf.samtools.SAMRecord;
-
-/**
- * Created by IntelliJ IDEA.
- * User: asivache
- * Date: Mar 20, 2009
- * Time: 12:53:53 AM
- * A record receiver that drops every record handed to it.
- */
-public class DiscardingReceiver implements RecordReceiver {
-    @Override
-    public void receive(SAMRecord r) {
-        // intentionally empty: the record is discarded
-    }
-}
diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/Indel.java b/archive/java/src/org/broadinstitute/sting/oldindels/Indel.java
deleted file mode 100755
index cd79b77a5..000000000
--- a/archive/java/src/org/broadinstitute/sting/oldindels/Indel.java
+++ /dev/null
@@ -1,153 +0,0 @@
-package org.broadinstitute.sting.playground.indels;
-
-import org.broadinstitute.sting.playground.utils.Interval;
-
-/** This class represents an indel as an interval with respect to the original reference and, in addition,
- * stores the indel type ( (I)nsertion or (D)eletion ) and can return meaningful event size (see below).
- * Depending on the indel type, the positions on the reference are:
- *

- * <ul>
- * <li>Deletion ( e.g. deletion of ACAC from the ref: ACGTT[ACAC]TTTAG to ACGTT[]TTTAG) - start and stop of
- * the interval are first and last deleted bases on the original reference (those in square brackets in the
- * first sequence in the example)</li>
- * <li>Insertion ( e.g. insertion of GTGT into the ref: ACGTT{}TTTAG to ACGTT{GTGT}TTTAG) - start is the first
- * position on the original reference after the insertion site (after the '}'), and stop is the last
- * position before the insertion site (prior to '{').</li>
- * </ul>
- *
- * Given these definitions, the length of the interval, as returned by getLength() has the meaning of the length of
- * the event (affected bases) on the original reference: number of deleted bases for deletion and zero for insertion.
- * The length of the indel itself is returned by getIndelLength(), which is equal to getLength() for deletions and to
- * the actual number of inserted bases for insertions (while length on the reference, as returned by getLength() is zero).
- *
- * The overlaps are also meaningful with the above definitions: if an alignment to (or, in general, an interval on)
- * the original reference ends prior to start, or starts after stop, it does not overlap
- * with the indel event (neither spans over deleted region or contains any of the inserted bases).
- *
- */
-public class Indel implements Interval {
-
-    public static enum IndelType { I, D };
-
-    private long mStart;
-    private long mLength;
-    private IndelType mType;
-
-    /** Creates nBases-long indel at specified start position; the object will be unusable
-     * until indel type is set.
-     * @param start start position on the reference
-     * @param nBases number of inserted or deleted bases
-     */
-    //public Indel(long start, long nBases) {
-    //    mType=null;
-    //    mStart=start;
-    //    mLength=nBases;
-    // }
-
-    /** Creates nBases-long indel of the specified type (insertion or deletion), at specified start position.
-     * @param start start position on the reference
-     * @param nBases number of inserted or deleted bases
-     * @param type Indel type: I or D.
-     */
-    public Indel(long start, long nBases, IndelType type) {
-        mType=type;
-        mStart=start;
-        mLength=nBases;
-    }
-
-    /** Start coordinate on the reference; for deletions it is the position of the first deleted base,
-     * for insertions it is the first base after the insertion.
-     * This is the "left boundary" of the event on the original reference: every alignment that ends
-     * before this position on the reference does not overlap with the indel.
-     * @return indel's left boundary
-     */
-    public long getStart() { return mStart; }
-
-    /** Sets start position of the interval.
-     *
-     * @param s start coordinate
-     */
-    public void setStart(long s) { mStart = s; }
-
-    /** Indel's stop coordinate on the reference; for deletions it is the position of the last deleted base,
-     * for insertions it is the last base before the insertion site (which makes it equal to getStart() - 1).
-     * This is the "right boundary" of the event: every alignment that starts after
-     * this position on the reference
-     * does not overlap with the indel.
-     * @return indel's right boundary
-     */
-    public long getStop() {
-        if ( mType == IndelType.I ) return mStart - 1;
-        else return mStart + mLength - 1;
-    }
-
-    /** This method is not supported in IndelInterval and will throw an exception. Use setIndelLength() instead.
-     *
-     * @param s stop coordinate
-     */
-    public void setStop(long s) {
-        throw new UnsupportedOperationException("Method setStop(long) is not supported in IndelInterval");
-    }
-
-    /** Returns type of this indel ( I or D).
-     *
-     * @return I or D enum element
-     */
-    public IndelType getType() { return mType; }
-
-    /** Sets the number of bases in this indel (i.e. the actual number of inserted or
-     * deleted bases). Stop position will be always correctly computed based on the indel length and indel type.
-     * @param nBases length of the indel (not the length of the event on the original reference!)
-     */
-    public void setIndelLength(long nBases) { mLength = nBases; }
-
-    /** Returns actual number of inserted or deleted bases in the indel.
-     *
-     * @return number of bases (not the event length on the original reference).
-     * @see #getLength()
-     */
-    public long getIndelLength() { return mLength; }
-
-    /**
-     * Returns true if this interval overlaps with i as judged by getStart() and getStop() positions of the
-     * two interval objects.
-     *
-     * @param i Another interval
-     * @return true iff intervals overlap
-     */
-
-    public boolean overlapsP(Interval i) {
-        return ! disjointP(i); // overlap is defined as the negation of disjointP()
-    }
-
-    /**
-     * Returns true if this interval does not overlap with i as judged by getStart() and getStop() positions of the
-     * two interval objects.
-     *
-     * @param i Another interval
-     * @return true iff intervals do not overlap
-     */
-    public boolean disjointP(Interval i) {
-        return i.getStop() < this.getStart() || i.getStart() > this.getStop();
-    }
-
-    /** Returns length of the region affected by the indel on the original reference. Note that an insertion
-     * has length of 0.
-     * @return length of the event on the original, unmodified reference
-     */
-    public long getLength() {
-        if ( mType == IndelType.I ) return 0;
-        return mLength;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if ( ! ( o instanceof Indel ) ) return false;
-        Indel i = (Indel)o;
-        return this.mType == i.mType && this.mStart == i.mStart && this.mLength == i.mLength ;
-    }
-
-    @Override
-    public int hashCode() {
-        // '+' binds tighter than '<<': without the parentheses this shifted mStart by (6 + mStart + mLength)
-        return (int)( (mStart << 6) + mStart + mLength );
-    }
-}
diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/IndelGenotyperWalker.java b/archive/java/src/org/broadinstitute/sting/oldindels/IndelGenotyperWalker.java
deleted file mode 100644
index 6f16ebc73..000000000
--- a/archive/java/src/org/broadinstitute/sting/oldindels/IndelGenotyperWalker.java
+++ /dev/null
@@ -1,862 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.indels;
-
-import net.sf.samtools.*;
-
-import org.broadinstitute.sting.gatk.refdata.*;
-import org.broadinstitute.sting.gatk.walkers.ReadWalker;
-import org.broadinstitute.sting.gatk.walkers.ReadFilters;
-import org.broadinstitute.sting.gatk.filters.Platform454Filter;
-import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
-import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter;
-import
org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper; - -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; - - -@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class, PlatformUnitFilter.class}) -public class IndelGenotyperWalker extends ReadWalker { - @Argument(fullName="outputFile", shortName="O", doc="output file name (defaults to BED format)", required=true) - java.io.File bed_file; - @Argument(fullName="1kg_format", shortName="1kg", doc="output in 1000 genomes format", required=false) - boolean FORMAT_1KG = false; - @Argument(fullName="somatic", shortName="somatic", - doc="Perform somatic calls; two input alignment files must be specified", required=false) - boolean call_somatic = false; - @Argument(fullName="verbose", shortName="verbose", - doc="Tell us what you are calling now (printed to stdout)", required=false) - boolean verbose = false; - @Argument(fullName="minCoverage", shortName="minCoverage", - doc="must have minCoverage or more reads to call indel; with --somatic this value is applied to tumor sample", required=false) - int minCoverage = 6; - @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", - doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads to call germline/somatic indel", required=false) - int minNormalCoverage = 4; - @Argument(fullName="minFraction", shortName="minFraction", - doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for a consensus call"+ - " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) - double minFraction = 0.3; - @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", - doc="Minimum 
fraction of CONSENSUS indel observations at a site wrt all indel observations at the site required to make the call", required=false) - double minConsensusFraction = 0.7; - @Argument(fullName="minIndelCount", shortName="minCnt", - doc="Minimum count of reads supporting consensus indel required for making the call. "+ - " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ - "(minIndelCount not met) will not pass.", required=false) - int minIndelCount = 0; - @Argument(fullName="refseq", shortName="refseq", - doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated as GENOMIC/UTR/INTRON/CODING", required=false) - String RefseqFileName = null; - @Argument(fullName="blacklistedLanes", shortName="BL", - doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ - "by this application, so they will not contribute indels to consider and will not be counted.", required=false) - PlatformUnitFilterHelper dummy; - @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging",required=false) Boolean DEBUG = false; - - private static int WINDOW_SIZE = 200; - private RunningCoverage tumor_coverage; - private RunningCoverage normal_coverage; // when performing somatic calls, we will be using this one for normal, and 'tumor_coverage' for tumor - private int currentContigIndex = -1; - private int currentPosition = -1; // position of the last read we've seen on the current contig - private String refName = null; - private java.io.Writer output = null; - private GenomeLoc location = null; - - private SeekableRODIterator refseqIterator=null; - - private Set normalReadGroups; - private Set tumorReadGroups ; - - private int MISMATCH_WIDTH = 5; // 5 bases on each side of the indel - private int MISMATCH_CUTOFF = 1000000; - private double AV_MISMATCHES_PER_READ = 1.5; - - - private static String annGenomic = "GENOMIC"; - private static 
String annIntron = "INTRON"; - private static String annUTR = "UTR"; - private static String annCoding = "CODING"; - private static String annUnknown = "UNKNOWN"; - - private SAMRecord lastRead; - - // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt" - - @Override - public void initialize() { - normal_coverage = new RunningCoverage(0,WINDOW_SIZE); - - if ( RefseqFileName != null ) { - ReferenceOrderedData refseq = new ReferenceOrderedData("refseq", - new java.io.File(RefseqFileName), rodRefSeq.class); - - refseqIterator = refseq.iterator(); - logger.info("Using RefSeq annotations from "+RefseqFileName); - } - - if ( refseqIterator == null ) logger.info("No annotations available"); - - int nSams = getToolkit().getArguments().samFiles.size(); - - location = GenomeLocParser.createGenomeLoc(0,1); - - List> readGroupSets = getToolkit().getMergedReadGroupsByReaders(); - - if ( call_somatic ) { - if ( nSams != 2 ) { - System.out.println("In --somatic mode two input bam files must be specified (normal/tumor)"); - System.exit(1); - } - tumor_coverage = new RunningCoverage(0,WINDOW_SIZE); - - normalReadGroups = readGroupSets.get(0); // first -I option must specify normal.bam - tumorReadGroups = readGroupSets.get(1); // second -I option must specify tumor.bam - } else { - if ( nSams != 1 ) System.out.println("WARNING: multiple input files specified. 
\n"+ - "WARNING: Without --somatic option they will be merged and processed as a single sample"); - } - - try { - output = new java.io.FileWriter(bed_file); - } catch (IOException e) { - throw new StingException("Failed to open file for writing BED output"); - } - } - - - @Override - public Integer map(char[] ref, SAMRecord read) { - - if ( DEBUG ) { -// System.out.println("DEBUG>> read at "+ read.getAlignmentStart()+"-"+read.getAlignmentEnd()+ -// "("+read.getCigarString()+")"); - if ( read.getDuplicateReadFlag() ) System.out.println("DEBUG>> Duplicated read (IGNORED)"); - } - - if ( AlignmentUtils.isReadUnmapped(read) || - read.getDuplicateReadFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getMappingQuality() == 0 ) { - return 0; // we do not need those reads! - } - - if ( read.getReferenceIndex() != currentContigIndex ) { - // we just jumped onto a new contig - - if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal - throw new StingException("Read "+read.getReadName()+": contig is out of order; input BAM file is unsorted"); - - // print remaining indels from the previous contig (if any); - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); - - currentContigIndex = read.getReferenceIndex(); - currentPosition = read.getAlignmentStart(); - refName = new String(read.getReferenceName()); - - location = GenomeLocParser.setContig(location,refName); - - normal_coverage.clear(); // reset coverage window; this will also set reference position to 0 - if ( call_somatic) tumor_coverage.clear(); - } - - // we have reset the window to the new contig if it was required and emitted everything we collected - // on a previous contig. At this point we are guaranteed that we are set up properly for working - // with the contig of the current read. - - // NOTE: all the sanity checks and error messages below use normal_coverage only. 
We make sure that normal_coverage and - // tumor_coverage are synchronized exactly (windows are always shifted together by emit_somatic), so it's safe - - if ( read.getAlignmentStart() < currentPosition ) // oops, read out of order? - throw new StingException("Read "+read.getReadName() +" out of order on the contig\n"+ - "Read starts at "+refName+":"+read.getAlignmentStart()+"; last read seen started at "+refName+":"+currentPosition - +"\nLast read was: "+lastRead.getReadName()+" RG="+lastRead.getAttribute("RG")+" at "+lastRead.getAlignmentStart()+"-" - +lastRead.getAlignmentEnd()+" cigar="+lastRead.getCigarString()); - - currentPosition = read.getAlignmentStart(); - - if ( read.getAlignmentStart() < normal_coverage.getStart() ) { - // should never happen - throw new StingException("Read "+read.getReadName()+": out of order on the contig\n"+ - "Read starts at "+read.getReferenceName()+":"+read.getAlignmentStart()+ " (cigar="+read.getCigarString()+ - "); window starts at "+normal_coverage.getStart()); - } - - lastRead = read; - - // a little trick here: we want to make sure that current read completely fits into the current - // window so that we can accumulate the coverage/indel counts over the whole length of the read. - // The ::getAlignmentEnd() method returns the last position on the reference where bases from the - // read actually match (M or D cigar elements). After our cleaning procedure, we can have reads that end - // with I element, which is not gonna be counted into alignment length on the reference. On the other hand, - // in this program we assign insertions, internally, to the first base *after* the insertion position. - // Hence, we have to make sure that that extra base is already in the window or we will get IndexOutOfBounds. 
- - long alignmentEnd = read.getAlignmentEnd(); - Cigar c = read.getCigar(); - if ( c.getCigarElement(c.numCigarElements()-1).getOperator() == CigarOperator.I) alignmentEnd++; - - if ( alignmentEnd > normal_coverage.getStop()) { - - // we don't emit anything until we reach a read that does not fit into the current window. - // At that point we shift the window to the start of that read and emit everything prior to - // that position (reads are sorted, so we are not gonna see any more coverage at those lower positions). - // Clearly, we assume here that window is large enough to accomodate any single read, so simply shifting - // the window to the read's start will ensure that the read fits... - - if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false ); - else emit( read.getAlignmentStart(), false ); - - if ( read.getAlignmentEnd() > normal_coverage.getStop()) { - // ooops, looks like the read does not fit into the window even after the latter was shifted!! - throw new StingException("Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small.\n"+ - "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ - read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+"; window start (after trying to accomodate the read)="+normal_coverage.getStart()+ - "; window end="+normal_coverage.getStop()); - } - } - - if ( call_somatic ) { - - String rg = (String)read.getAttribute("RG"); - if ( rg == null ) throw new StingException("Read "+read.getReadName()+" has no read group in merged stream. 
RG is required for somatic calls."); - - if ( normalReadGroups.contains(rg) ) { - normal_coverage.add(read,ref); - } else if ( tumorReadGroups.contains(rg) ) { - tumor_coverage.add(read,ref); - } else { - throw new StingException("Unrecognized read group in merged stream: "+rg); - } - } else { - normal_coverage.add(read, ref); - } - - return 1; - } - - /** Returns the indel variant with the largest count (ie consensus) among all the observed - * variants, and the total count of all observations of any indels (including non-consensus) - * @param variants - * @return - */ - private Pair findConsensus(List variants) { - int total_variant_count = 0; - int max_variant_count = 0; - IndelVariant v = null; - - for ( IndelVariant var : variants ) { - if ( DEBUG ) System.out.println("DEBUG>> Variant "+var.getBases()+" (cnt="+var.getCount()+")"); - int cnt = var.getCount(); - total_variant_count +=cnt; - if ( cnt > max_variant_count ) { - v = var; - max_variant_count = cnt; - } - } - if ( DEBUG ) System.out.println("DEBUG>> Returning: "+v.getBases()+" (cnt="+v.getCount()+") with total count of "+total_variant_count); - return new Pair(v,total_variant_count); - } - - /** Returns true if consensus (specified by the pair) should be considered a call given current values - * of the cutoffs. - * @param p pair with first element being the consensus indel variant, the second element being the total (consensus+others) - * count of indels at the site. - * @param coverage total coverage (number of spanning reads, including those with indel(s)) at the site. - * @return - */ - private boolean isCall(Pair p, int coverage) { - boolean ret = ( p.first.getCount() >= minIndelCount && - (double)p.first.getCount() > minFraction * coverage && - (double) p.first.getCount() > minConsensusFraction*p.second ); - if ( DEBUG && ! 
ret ) System.out.println("DEBUG>> NOT a call: count="+p.first.count+" total_count="+p.second+" cov="+coverage+ - " minConsensusF="+((double)p.first.count)/p.second+" minF="+((double)p.first.count)/coverage); - return ret; - } - - /** Build output line for bed file and write it to the specified output writer if the latter is not null; - * the line is also returned by this method as a String - * - * @param p - * @param coverage - * @param pos - * @param bedOutput - * @return - */ - private String makeBedLine(Pair p, int coverage, long pos, java.io.Writer bedOutput) { - int event_length = p.first.lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - StringBuffer message = new StringBuffer(); - message.append(refName+"\t"+(pos-1)+"\t"); - if ( FORMAT_1KG ) - message.append(p.first.getBases().length() + "\t" + (event_length > 0 ? "D" : "I") + "\t" + p.first.getBases() + "\t" + p.first.getSamples()); - else - message.append((pos-1+event_length)+"\t"+(event_length>0? "-":"+")+p.first.getBases() +":"+p.second+"/"+coverage); - - if ( bedOutput != null ) { - try { - bedOutput.write(message.toString()+"\n"); - } catch (IOException e) { - System.out.println(e.getMessage()); - e.printStackTrace(); - throw new StingException("Error encountered while writing into output BED file"); - } - } - return message.toString(); - } - - /** Same as makeBedLine(Pair,int,long,Writer), but only builds and returns the line without writing it anywhere. - * - * @param p - * @param coverage - * @param pos - * @return - */ - private String makeBedLine(Pair p, int coverage, long pos) { - return makeBedLine(p, coverage, pos, null); - } - - /** Output indel calls up to the specified position and shift the coverage array: after this method is executed, the - * first element of the coverage array maps onto 'position', or a few bases to the left of 'position' if we may need more - * reads to get full NQS-style statistics for an indel in the close proximity of 'position'. 
- * - * @param position - */ - private void emit(long position, boolean force) { - - long move_to = position; // we will shift to position move_to; it's initialized with 'position', - // but it may end up being smaller (delayed shift), if we have not - // covered MISMATCH_WIDTH bases to the right of the last indel yet. - -// boolean debug = false; -// if ( coverage.getStart() <= 19661504 && coverage.getStop() >= 19661504 ) debug = true; - - if ( DEBUG ) System.out.println("DEBUG>> Window: ["+normal_coverage.getStart()+", "+normal_coverage.getStop()+"]; shift requested: to "+position); - - // walk along the coverage window and emit indels up to the position we are trying ot shift the window to - for ( long pos = normal_coverage.getStart() ; pos < Math.min(position,normal_coverage.getStop()+1) ; pos++ ) { - - List variants = normal_coverage.indelsAt(pos); - if ( variants.size() == 0 ) continue; // no indels at current position, go check next one - - // if we are here, we got a variant - - int cov = normal_coverage.coverageAt(pos); // depth of coverage - - if ( cov < minCoverage ) continue; // coverage too low to make a call - - // region around the current indel we need to have covered in order to compute mismatch rate: - long left = Math.max( pos-MISMATCH_WIDTH, normal_coverage.getStart() ); - long right = pos+MISMATCH_WIDTH; - - if ( DEBUG ) System.out.println("DEBUG>> Indel at "+pos); - - if ( right >= position && ! 
force) { - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = left; - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ left); - break; // abort, don't output current indel - yet - } - - // ok, right < position we are shifting to (or we force-shift), so we already have all the coverage within - // MISMATCH_WINDOW bases around the indel; - // we can proceed with counting mismatches and emitting the indel: - - if ( right > normal_coverage.getStop() ) right = normal_coverage.getStop(); // in case indel is too close to the end of the window but we need to emit (force-shift) - - // count mismatches around the current indel, inside the specified window (MISMATCH_WIDTH on each side): - int total_mismatches = 0; - for ( long k = left; k <= right ; k++ ) total_mismatches+=normal_coverage.mismatchesAt(k); - - if ( total_mismatches > MISMATCH_CUTOFF || total_mismatches > ((double)cov)*AV_MISMATCHES_PER_READ) { - out.println(refName+"\t"+(pos-1)+"\t"+ - "\tTOO DIRTY\t"+total_mismatches); - normal_coverage.indelsAt(pos).clear(); // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - - continue; // too dirty - } - - location = GenomeLocParser.setStart(location,pos); location = GenomeLocParser.setStop(location,pos); // retrieve annotation data - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - - Pair p = findConsensus(variants); - if ( isCall(p,cov) ) { - String message = makeBedLine(p,cov,pos,output); - String annotationString = (refseqIterator == null ? 
"" : getAnnotationString(annotationList)); - - if ( verbose ) out.println(message + "\t"+ annotationString); - } - normal_coverage.indelsAt(pos).clear(); // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there will be another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - -// for ( IndelVariant var : variants ) { -// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); -// } - } - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to+" ("+position+")"); - normal_coverage.shift((int)(move_to - normal_coverage.getStart() ) ); - } - - /** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed - * first elements of the coverage arrays map onto 'position', or a few bases prior to the specified position - * if there is an indel in close proximity to 'position' so that we may get more coverage around it later. 
- * - * @param position - */ - private void emit_somatic(long position, boolean force) { - - long move_to = position; - - for ( long pos = tumor_coverage.getStart() ; pos < Math.min(position,tumor_coverage.getStop()+1) ; pos++ ) { - - - List tumor_variants = tumor_coverage.indelsAt(pos); - List normal_variants = normal_coverage.indelsAt(pos); - - if ( tumor_variants.size() == 0 ) continue; // no indels in tumor - - - int tumor_cov = tumor_coverage.coverageAt(pos); - int normal_cov = normal_coverage.coverageAt(pos); - - if ( tumor_cov < minCoverage ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumor_cov+" (SKIPPED)"); - } - continue; // low coverage - } - if ( normal_cov < minNormalCoverage ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normal_cov+" (SKIPPED)"); - } - continue; // low coverage - } - - if ( DEBUG ) System.out.println("DEBUG>> Indel in tumor at "+pos); - - long left = Math.max( pos-MISMATCH_WIDTH, tumor_coverage.getStart() ); - long right = pos+MISMATCH_WIDTH; - - if ( right >= position && ! 
force) { - // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect - - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = left; - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ left); - break; - } - - if ( right > tumor_coverage.getStop() ) right = tumor_coverage.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right - - // count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side): - int total_mismatches_normal = 0; - int total_mismatches_tumor = 0; - for ( long k = left; k <= right ; k++ ) { - total_mismatches_tumor+=tumor_coverage.mismatchesAt(k); - total_mismatches_normal+=normal_coverage.mismatchesAt(k); - } - - if ( total_mismatches_normal > MISMATCH_CUTOFF || total_mismatches_normal > ((double)normal_cov)*AV_MISMATCHES_PER_READ) { - out.println(refName+"\t"+(pos-1)+"\t"+ - "\tNORMAL TOO DIRTY\t"+total_mismatches_normal); - tumor_coverage.indelsAt(pos).clear(); - normal_coverage.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - continue; // too dirty - } - if ( total_mismatches_tumor > MISMATCH_CUTOFF || total_mismatches_tumor > ((double)tumor_cov)*AV_MISMATCHES_PER_READ) { - out.println(refName+"\t"+(pos-1)+"\t"+ - "\tTUMOR TOO DIRTY\t"+total_mismatches_tumor); - tumor_coverage.indelsAt(pos).clear(); - normal_coverage.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) 
there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - continue; // too dirty - } - location = GenomeLocParser.setStart(location,pos); location = GenomeLocParser.setStop(location,pos); // retrieve annotation data - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - - Pair p_tumor = findConsensus(tumor_variants); - if ( isCall(p_tumor,tumor_cov) ) { - String message = makeBedLine(p_tumor,tumor_cov,pos); - String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); - - - if ( normal_variants.size() == 0 ) { - - try { - output.write(message+"\n"); - } catch (IOException e) { - System.out.println(e.getMessage()); - e.printStackTrace(); - throw new StingException("Error encountered while writing into output BED file"); - } - message += "\tSOMATIC\t0/"+normal_cov; - } else { - Pair p_normal = findConsensus(normal_variants); - - message += "\tGERMLINE\t"+p_normal.second+"/"+normal_cov; - } - if ( verbose ) { - if ( refseqIterator == null ) out.println(message + "\t"); - else out.println(message + "\t"+ annotationString); - } - } - - tumor_coverage.indelsAt(pos).clear(); - normal_coverage.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - -// for ( IndelVariant var : variants ) { -// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); -// } - } - - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to+" ("+position+")"); - tumor_coverage.shift((int)(move_to - tumor_coverage.getStart() ) ); - normal_coverage.shift((int)(move_to - normal_coverage.getStart() ) ); - } - - - private String getAnnotationString(RODRecordList ann) { - if ( ann == null ) return 
annGenomic; - else { - StringBuilder b = new StringBuilder(); - - if ( rodRefSeq.isExon(ann) ) { - if ( rodRefSeq.isCoding(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence - else b.append(annUTR); // exon but not coding = UTR - } else { - if ( rodRefSeq.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron - else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... - } - b.append('\t'); - b.append(((Transcript)ann.getRecords().get(0)).getGeneName()); // there is at least one transcript in the list, guaranteed -// while ( it.hasNext() ) { // -// t.getGeneName() -// } - return b.toString(); - } - - } - - @Override - public void onTraversalDone(Integer result) { - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); // emit everything we might have left - try { - output.close(); - } catch (IOException e) { - System.out.println("Failed to close output BED file gracefully, data may be lost"); - e.printStackTrace(); - } - super.onTraversalDone(result); - } - - @Override - public Integer reduce(Integer value, Integer sum) { - if ( value == -1 ) { - onTraversalDone(sum); - System.exit(1); - } - sum += value; - return sum; - } - - @Override - public Integer reduceInit() { - - return new Integer(0); - } - - - static class IndelVariant { - public static enum Type { I, D}; - private String bases; - private Type type; - private int count; - private HashSet samples = new HashSet(); - - public IndelVariant(Type type, String bases) { - this.type = type; - this.bases = bases; - this.count = 1; - } - - public void increment(int i) { - count += i; - } - public void increment() { count+=1; } - - /** Returns length of the event on the reference (number of deleted bases - * for deletions, -1 for insertions. 
- * @return - */ - public int lengthOnRef() { - if ( type == Type.D ) return bases.length(); - else return 0; - } - - - public void addSample(String sample) { - if ( sample != null ) - samples.add(sample); - } - - public String getSamples() { - StringBuffer sb = new StringBuffer(); - Iterator i = samples.iterator(); - while ( i.hasNext() ) { - sb.append(i.next()); - if ( i.hasNext() ) - sb.append(","); - } - return sb.toString(); - } - - public int getCount() { return count; } - - public String getBases() { return bases; } - - public Type getType() { return type; } - - @Override - public boolean equals(Object o) { - if ( ! ( o instanceof IndelVariant ) ) return false; - IndelVariant that = (IndelVariant)o; - return ( this.type == that.type && this.bases.equals(that.bases) ); - } - - public boolean equals(Type type, String bases) { - return ( this.type == type && this.bases.equals(bases) ); - } - } - - static class RunningCoverage { - private long start; // we keep coverage starting at this position on the reference - - private CircularArray.Int coverageWindow; - private CircularArray< List< IndelVariant > > indels; - private CircularArray.Int mismatches; - - // Lists will exactly mimic the reads covering corresponding base, in the right order; - // value = 1 if read has a mismatch, 0 otherwise - private CircularArray< List > mm_flags; - - // Lists will exactly mimic the reads covering corresponding base, in the right order; - // i-th value = base quality at this location in the i-th read - private CircularArray< List > base_quals; - - private static List emptyIndelList; - private static Integer ZERO = new Integer(0); - private static Integer ONE = new Integer(1); - - static { - emptyIndelList = new ArrayList(); - } - - public RunningCoverage(long start, int length) { - this.start = start; - coverageWindow = new CircularArray.Int(length); - indels = new CircularArray< List >(length); - mismatches = new CircularArray.Int(length); - } - - /** Returns 1-based reference 
start position of the interval this object keeps coverage for. - * - * @return - */ - public long getStart() { return start; } - - /** Returns 1-based reference stop position (inclusive) of the interval this object keeps coverage for. - * - * @return - */ - public long getStop() { return start + coverageWindow.length() - 1; } - - /** Returns the number of reads spanning over the specified reference position - * (regardless of whether they have a base or indel at that specific location) - * @param refPos position on the reference; must be within the bounds of the window, - * otherwise IndexOutOfBoundsException will be thrown - */ - public int coverageAt(final long refPos) { - return coverageWindow.get( (int)( refPos - start ) ); - } - - public int mismatchesAt(final long refPos) { return mismatches.get((int)(refPos-start)); } - - public List indelsAt( final long refPos ) { - List l = indels.get((int)( refPos - start )); - if ( l == null ) return emptyIndelList; - else return l; - } - - /** Increments coverage in the currently held window for every position covered by the - * specified read; we count the hole span of read getAlignmentStart()-getAlignmentEnd() here, - * regardless of whether there are indels in the middle.Read must be completely within the current - * window, or an exception will be thrown. 
- * @param r - */ - public void add(SAMRecord r, char [] ref) { - final long rStart = r.getAlignmentStart(); - final long rStop = r.getAlignmentEnd(); - final String readBases = r.getReadString().toUpperCase(); - - - int localStart = (int)( rStart - start ); // start of the alignment wrt start of the current window - - try { - for ( int k = localStart; k <= (int)(rStop-start) ; k++ ) coverageWindow.increment(k, 1); - } catch ( IndexOutOfBoundsException e) { // replace the message and re-throw: - throw new IndexOutOfBoundsException("Current coverage window: "+getStart()+"-"+getStop()+ - "; illegal attempt to add read spanning "+rStart+"-"+rStop); - } - - // now let's extract indels: - - Cigar c = r.getCigar(); - final int nCigarElems = c.numCigarElements(); - - // if read has no indels, there is nothing to do - if ( c.numCigarElements() <= 1 ) return ; - - int posOnRead = 0; - int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read: - // its start on the actual full reference contig is r.getAlignmentStart() - // int mm=0; - - for ( int i = 0 ; i < nCigarElems ; i++ ) { - - final CigarElement ce = c.getCigarElement(i); - IndelVariant.Type type = null; - String bases = null; - int eventPosition = posOnRef; - - - switch(ce.getOperator()) { - case I: - type = IndelVariant.Type.I; - bases = readBases.substring(posOnRead,posOnRead+ce.getLength()); - // will increment position on the read below, there's no 'break' statement yet... 
- case H: - case S: - // here we also skip hard and soft-clipped bases on the read; according to SAM format specification, - // alignment start position on the reference points to where the actually aligned - // (not clipped) bases go, so we do not need to increment reference position here - posOnRead += ce.getLength(); - break; - case D: - type = IndelVariant.Type.D; - bases = new String( ref, posOnRef, ce.getLength() ); - posOnRef += ce.getLength(); - break; - case M: for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { - if ( readBases.charAt(posOnRead) != Character.toUpperCase(ref[posOnRef]) ) { // mismatch! - mismatches.increment(localStart+posOnRef, 1); //mm++; - } - } - break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator()); - } - - if ( type == null ) continue; // element was not an indel, go grab next element... - - // we got an indel if we are here... - if ( i == 0 ) logger.debug("Indel at the start of the read "+r.getReadName()); - if ( i == nCigarElems - 1) logger.debug("Indel at the end of the read "+r.getReadName()); - - try { - // note that here we will be assigning indels to the first deleted base or to the first - // base after insertion, not to the last base before the event! 
- updateCount(localStart+eventPosition, type, bases, r); - } catch (IndexOutOfBoundsException e) { - System.out.println("Read "+r.getReadName()+": out of coverage window bounds.Probably window is too small.\n"+ - "Read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+ - r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+ - "; window end="+getStop()); - throw e; - } - } - -// System.out.println(r.getReadName()+"\t"+(r.getReadNegativeStrandFlag()?"RC":"FW")+"\t"+r.getCigarString()+"\t"+mm); -// System.out.println(AlignmentUtils.alignmentToString(r.getCigar(), readBases, new String(ref), 0)); - - } - - /** Convenience shortcut method. Checks if indel of specified type and with specified bases is already recorded - * for position pos (relative to start of the window getStart()). If such indel is found, the counter - * is increased; if it is not found, a new indel (with count = 1, obviously) will be added at that position. If indel array - * still had null at the specified position, this method will instantiate new list of indels for this position - * transparently. - * - * @param pos - * @param type - * @param bases - */ - private void updateCount(int pos, IndelVariant.Type type, String bases, SAMRecord r) { - List indelsAtSite = indels.get(pos); - if ( indelsAtSite == null ) { - indelsAtSite = new ArrayList(); - indels.set(pos, indelsAtSite); - } - - String sample = r.getReadGroup().getSample(); - - boolean found = false; - for ( IndelVariant v : indelsAtSite ) { - if ( ! v.equals(type, bases) ) continue; - - v.increment(); - v.addSample(sample); - found = true; - break; - } - - if ( ! found ) { - IndelVariant v = new IndelVariant(type, bases); - v.addSample(sample); - indelsAtSite.add(v); - - } - } - - /** Resets reference start position to 0 and sets all coverage counts in the window to 0. 
- * - */ - public void clear() { - start = 0; - coverageWindow.clear(); - indels.clear(); - } - - /** Shifts current window to the right along the reference contig by the specified number of bases. - * Coverage counts computed earlier for the positions that remain in scope will be preserved. - * @param offset - */ - public void shift(int offset) { - start += offset; - coverageWindow.shiftData(offset); - indels.shiftData(offset); - mismatches.shiftData(offset); - } - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/IndelInspectorMain.java b/archive/java/src/org/broadinstitute/sting/oldindels/IndelInspectorMain.java deleted file mode 100755 index 73ac54b5f..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/IndelInspectorMain.java +++ /dev/null @@ -1,339 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import java.io.File; -import java.util.Map; -import java.util.HashMap; - - -import javax.swing.JFileChooser; -import javax.swing.filechooser.FileNameExtensionFilter; -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Option; -import net.sf.picard.cmdline.Usage; -import net.sf.picard.reference.ReferenceSequenceFileWalker; -import net.sf.picard.reference.ReferenceSequence; - -import net.sf.samtools.*; -import org.broadinstitute.sting.utils.*; - -public class IndelInspectorMain extends CommandLineProgram { - - // Usage and parameters - @Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n"; - @Option(shortName="I", doc="SAM or BAM file for calling",optional=true) public File INPUT_FILE; - @Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION; - @Option(shortName="V",doc="Verbosity level: SILENT, PILESUMMARY, ALIGNMENTS", optional=true) public String VERBOSITY_LEVEL; - @Option(doc="Output file (sam or bam) for non-indel related reads and 
indel reads that were not improved (see OUTF)") public String OUT1; - @Option(doc="Output file (sam or bam) for improved (realigned) indel related reads") public String OUT2; - @Option(doc="Output file (sam or bam) for indel related reads that fail to realign", optional = true ) public String OUTF; - @Option(doc="[paranoid] If true, all reads that would be otherwise picked and processed by this tool will be saved, unmodified, into OUT1", optional=true) public Boolean CONTROL_RUN; - @Option(doc="Error counting mode: MM - mismatches only (from sam tags), MC - mismatches only doing actual mismatch count on the fly (use this if tags are incorrectly set); ERR - errors (arachne style: mm+gap lengths), MG - count mismatches and gaps as one error each") public String ERR_MODE; - @Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS; - @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE; - @Option(doc="Ignore reads that are longer than the specified cutoff (not a good way to do things but might be necessary because of performance issues)", optional=true) public Integer MAX_READ_LENGTH; - @Option(doc="Realignment will be attempted around trains of indels with at least one indel observed COUNT_CUTOFF times or more",optional=true) public Integer COUNT_CUTOFF; - - /** Required main method implementation. */ - public static void main(final String[] argv) { - System.exit(new IndelInspectorMain().instanceMain(argv)); - } - - protected int doWork() { - - int discarded_cigar_count = 0; - int discarded_long_read_count = 0; - int discarded_maxerr = 0; - int reads_accepted = 0; - int reads_with_indels_accepted = 0; - - ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker( - REF_FILE - ); - - if ( reference.getSequenceDictionary() == null ) { - System.out.println("No reference sequence dictionary found. 
Abort."); - } - - GenomeLocParser.setupRefContigOrdering(reference.getSequenceDictionary()); - GenomeLoc location = null; - if ( GENOME_LOCATION != null ) { - location = GenomeLocParser.parseGenomeLoc(GENOME_LOCATION); - } - - if ( COUNT_CUTOFF == null ) COUNT_CUTOFF = 2; - - if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") && ! ERR_MODE.equals("MC") ) { - System.out.println("Unknown value specified for ERR_MODE: "+ERR_MODE); - return 1; - } - - final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/")); - samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); - - // setContigOrdering(samReader); - - - if ( MAX_READ_LENGTH == null ) MAX_READ_LENGTH = 1000000000; - - ReferenceSequence contig_seq = null; - - IndelRecordPileCollector col = null; - PassThroughWriter ptWriter = new PassThroughWriter(OUT1,samReader.getFileHeader()); - PassThroughWriter ptFailedWriter = null; - if ( OUTF != null ) ptFailedWriter = new PassThroughWriter(OUTF,samReader.getFileHeader()); - PileBuilder pileBuilder = null; - if ( CONTROL_RUN == null ) CONTROL_RUN=false; - if ( ! CONTROL_RUN ) pileBuilder = new PileBuilder(OUT2,samReader.getFileHeader(), ptFailedWriter == null? ptWriter : ptFailedWriter); - - try { - if ( CONTROL_RUN ) col = new IndelRecordPileCollector(ptWriter, new DiscardingPileReceiver() ); - else col = new IndelRecordPileCollector(ptWriter, pileBuilder ); - } catch(Exception e) { System.err.println(e.getMessage()); } - if ( col == null ) return 1; - - col.setControlRun(CONTROL_RUN); - col.setIndelCountAcceptanceCutoff(COUNT_CUTOFF); - - if ( ! 
CONTROL_RUN ) { - if ( VERBOSITY_LEVEL == null ) VERBOSITY_LEVEL = new String("SILENT"); - if ( VERBOSITY_LEVEL.toUpperCase().equals("SILENT")) pileBuilder.setVerbosity(PileBuilder.SILENT); - else if ( VERBOSITY_LEVEL.toUpperCase().equals("PILESUMMARY") ) pileBuilder.setVerbosity(PileBuilder.PILESUMMARY); - else if ( VERBOSITY_LEVEL.toUpperCase().equals("ALIGNMENTS") ) pileBuilder.setVerbosity(PileBuilder.ALIGNMENTS); - else { - System.out.println("Unrecognized VERBOSITY_LEVEL setting."); - return 1; - } - } - - String cur_contig = null; - long t=0,tc=System.currentTimeMillis(); // time - boolean done_printing = false; - - for ( SAMRecord r : samReader ) { - - if ( r.getReadUnmappedFlag() ) { continue; } - if ( r.getReferenceName() != cur_contig) { - cur_contig = r.getReferenceName(); - System.out.println("Contig "+cur_contig); - // if contig is specified and we are past that contig, we are done: - if ( location != null && GenomeLocParser.compareContigs(cur_contig, location.getContig()) == 1 ) break; - if ( location == null || GenomeLocParser.compareContigs(cur_contig, location.getContig()) == 0 ) { - if ( location != null ) System.out.println("Time spent to scroll input bam file to the specified chromosome: "+ ((System.currentTimeMillis()-tc)/1000) + " seconds."); - tc = System.currentTimeMillis(); - contig_seq = reference.get(r.getReferenceIndex()); - t = System.currentTimeMillis(); - String refstr = new String(contig_seq.getBases()); - if (!CONTROL_RUN) pileBuilder.setReferenceSequence(refstr); - System.out.println("Contig "+cur_contig+" (index="+r.getReferenceIndex()+") loaded in "+ ((t-tc)/1000) +" seconds; length="+contig_seq.getBases().length+" tst="+contig_seq.toString()); - } - } - - // if contig is specified and we did not reach it yet, skip the records until we reach that contig: - if ( location != null && GenomeLocParser.compareContigs(cur_contig, location.getContig()) == -1 ) continue; - - if ( location != null && r.getAlignmentEnd() < 
location.getStart() ) continue; - - if ( location != null && ! done_printing ) { - System.out.println("Time spent to scroll input bam file to the specified location on the chromosome: " + ((System.currentTimeMillis()-t)/1000)+" seconds."); - done_printing = true; - } - // if stop position is specified and we are past that, stop reading: - if ( location != null && r.getAlignmentStart() > location.getStop() ) break; - - // if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY") > 0 ) continue; // skip chrM and unplaced contigs for now - - // we currently do not know how to deal with cigars containing elements other than M,I,D, so - // let's just skip the reads that contain those other elements (clipped reads?) - Cigar c = r.getCigar(); - boolean cigar_acceptable = true; - boolean has_indel = false; - - for ( int z = 0 ; z < c.numCigarElements() ; z++ ) { - CigarElement ce = c.getCigarElement(z); - switch ( ce.getOperator() ) { - case M: break; - case I: - case D: has_indel = true; break; - default: - cigar_acceptable = false; - } - } - if ( ! 
cigar_acceptable ) { - discarded_cigar_count++; - continue; - } - - if ( r.getReadLength() > MAX_READ_LENGTH ) { - discarded_long_read_count++; - continue; - } - - int err = -1; -/* - System.out.println("MM: "+numMismatches(r)); - System.out.println("direct: "+numMismatchesDirect(r,contig_seq)); - System.out.print(" "); - for ( int i = r.getAlignmentStart() - 1 ; i < r.getAlignmentEnd() ; i++ ) System.out.print((char)contig_seq.getBases()[i]); - System.out.println(); - System.out.println((r.getReadNegativeStrandFlag()?"<-":"->")+r.getReadString()); - System.out.println("cigar: "+r.getCigarString()); - System.out.println(); - if (counter++ == 20 ) break; - continue; -*/ - - if ( ERR_MODE.equals("MM")) err = numMismatches(r,contig_seq); - else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(r,contig_seq); - else if ( ERR_MODE.equals("ERR")) err = numErrors(r,contig_seq); - else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r,contig_seq); - if ( err > MAX_ERRS.intValue() ) { - discarded_maxerr++; - continue; - } - - reads_accepted++; - if ( has_indel ) reads_with_indels_accepted++; - // counter++; - // if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString()); - col.receive(r); - - } - - if ( ! 
CONTROL_RUN ) { - pileBuilder.printStats(); - pileBuilder.close(); - } - System.out.println("done."); - System.out.println("Discarded reads with non-M,I,D cigar elements: "+ discarded_cigar_count); - System.out.println("Discarded long reads (above "+MAX_READ_LENGTH+" bp): "+ discarded_long_read_count); - System.out.println("Discarded reads with error counts above "+MAX_ERRS+ ": "+ discarded_maxerr); - System.out.println("Reads passed to realigner: "+ reads_accepted+" total; "+reads_with_indels_accepted+" with indel(s)"); - System.out.println(); - col.printLengthHistograms(); - samReader.close(); - ptWriter.close(); - if ( ptFailedWriter != null ) ptFailedWriter.close(); - return 0; - } - - /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD - * - * @param r SAM record that must specify an alignment - * @return number of errors (number of mismatches plus total length of all insertions/deletions - * @throws RuntimeException - */ - private static int numErrors(SAMRecord r, ReferenceSequence refseq) throws RuntimeException { - - // NM currently stores the total number of mismatches in all blocks + 1 - int errs = numMismatches(r,refseq); - - // now we have to add the total length of all indels: - Cigar c = r.getCigar(); - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - switch( ce.getOperator()) { - case M : break; // we already have correct number of mismatches - case I : - case D : - errs += ce.getLength(); - break; - default: throw new RuntimeException("Unrecognized cigar element"); - } - } - return errs; - } - - /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD - * - * @param r SAM record that must specify an alignment - * @return number of errors (number of mismatches plus total number of all insertions/deletions (each insertion or - * deletion will be counted as a single error regardless of the length) - * @throws 
RuntimeException - */ - private static int numMismatchesGaps(SAMRecord r,ReferenceSequence refseq) throws RuntimeException { - - // NM currently stores the total number of mismatches in all blocks + 1 - int errs = numMismatches(r,refseq); - - // now we have to add the total length of all indels: - Cigar c = r.getCigar(); - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - switch( ce.getOperator()) { - case M : break; // we already have correct number of mismatches - case I : - case D : - errs++; - break; - default: throw new RuntimeException("Unrecognized cigar element"); - } - } - return errs; - } - - /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */ - private static int numMismatches(SAMRecord r, ReferenceSequence refseq) throws RuntimeException { - - // NM currently stores the total number of mismatches in all blocks + 1 - Integer i = (Integer)r.getAttribute("NM"); - if ( i == null ) return AlignmentUtils.numMismatches(r,refseq); - return ((Integer)r.getAttribute("NM")).intValue() - 1; - - } - - /** Trivial utility method that goes some distance trying to ensure that the input file is there; - * the only purpose is reducing clutter in main(). Receives a default - * input file argument, does a few checks (e.g. that it is non-null and exists), if they fail tries - * to fire up a file chooser dialog using start_folder as initial directory, etc. - * @param default_arg some "default" input file; if it is non-null and exists, nothing else will be done, - * and the same default_arg objetc will be returned; otherwise the method will try to ask for a "better" input. - * @param start_folder should file open dialog be fired up, it will initially display this directory. - * @return File object that is not null and does exist (there is no check that it is a valid SAM/BAM file though). 
- */ - private File getInputFile(File default_arg, String start_folder) { - File f = default_arg; - if ( f==null || ! f.exists() ) { - JFileChooser fc = new JFileChooser(start_folder); - FileNameExtensionFilter ff = new FileNameExtensionFilter("SAM and BAM files","sam","bam"); - fc.setFileFilter(ff); - fc.setFileSelectionMode(JFileChooser.FILES_ONLY); - - int ret = fc.showOpenDialog(null); - f = fc.getSelectedFile(); - if ( ret != JFileChooser.APPROVE_OPTION ) { - System.out.println("No input file specified. Exiting..."); - System.exit(1); - } - } - - if ( f == null || ! f.exists() ) { - System.out.println("SAM or BAM input file must be specified. Exiting..."); - System.exit(1); - } - - return f; - } - - /** Auxiliary method to remove some clutter from main(); gets called only once and tries to get - * contig ordering from the header provided by opened SAM reader; if no header info is available - * falls back to default ordering; whichever ordering is used, it is set for GenomeLoc class. 
- * @param r sam reader to get header from - */ - private void setContigOrdering(SAMFileReader r) { - SAMFileHeader h = r.getFileHeader(); - if ( h == null ) { - System.out.println("No header found in SAM file, falling back to default contig ordering"); - setDefaultContigOrdering(); - return; - } - GenomeLocParser.setupRefContigOrdering(h.getSequenceDictionary()); - } - - private void setDefaultContigOrdering() { - Map rco = new HashMap(); - rco.put("chrM",0); - for ( int i = 1 ; i <= 22 ; i++ ) rco.put(Integer.toString(i),i);//rco.put("chr"+i,i); - rco.put("chrX",23); - rco.put("chrY",24); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/IndelRecordPileCollector.java b/archive/java/src/org/broadinstitute/sting/oldindels/IndelRecordPileCollector.java deleted file mode 100755 index 4d720af66..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/IndelRecordPileCollector.java +++ /dev/null @@ -1,541 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import java.util.*; - - -import org.broadinstitute.sting.playground.indels.Indel.IndelType; -import org.broadinstitute.sting.playground.utils.*; -import net.sf.samtools.*; - -/** Ultimately, this class is a splitter for a stream of alignment records. It detects putative indels, or - * trains of sufficiently close indels, and sends the alignments two-way: those that do not overlap with any - * detected indels or trains of indels, and those that do. The latters are emitted in finished piles of - * all alignments that overlap with genomic interval of interest. This collector should be bound to - * and driven by an alignment traversal engine that sends alignment records one by one. - * - * NOTE 1: alignments must be sent to the collector strictly in the order - * of non-decreasing reference start position. 
- * - * NOTE 2: a train of indels is defined as a sequence of (putative) indels such that each pair of adjacent indels - * is overlapped by at least one alignment (that alignment does not have to have both indels in it, but only - * to span over both positions). A "genomic region of interest" is defined as the smallest interval - * containing all indels in the train, and all the alignments that overlap with that region will be collected - * into one pile. For instance, if reads of different length are present in the dataset, it is possible that two - * adjacent indels are overlapped by a single longer read (which stitches them into the train), but there are - * shorter reads that fall completely into the in-between region (so that they technically do not overlap with any - * indels in the train). According to the above definition, these shorter reads will be still emitted into the pile, - * since they overlap with the "region of interest". - * - * NOTE 3: due to performance/memory issues, the collector may refuse to assemble a pile over pathologically long - * train of indels. In this case, it will keep detecting the indel train in order to be able to understand what is - * going on and to recover later, but the reads will be sent to the "non-overlapping" output channel. - * - * In order to abstract and decouple the operation of emitting records, the collector expects to be bound to an - * implementation of RecordEmitter interface. It is the emitter's implementation that decides what to do with - * alignments of the two types (not related to indels vs. piles of alignments overlapping with indels). While - * this collector has some delay between receiving an alignment and being able to decide which way it should go, - * no records are ever discarded. - * - * Implementation note: - * - * In order to achive its goal, the collector has a state ('wait' or 'active') and always - * keeps a variable size "backlog" pile of alignments that were sent to it most recently. 
In 'wait' state collector - * has not detected any putative indels just yet. The backlog pile contains only alignments of "undecided fate": those - * that still might overlap with an indel should it be detected in the future. All alignments that end before the - * current position on the genome have their fate determined (as not overlapping with any indels) and emitted. - * When an indel is encountered, the collector flips into the 'active' state and from that moment on keeps all - * the alignments in the pile and collects information on the indels (their positions on the reference and numbers - * of observations). - * - * Since only alignments are sorted, but not indels (an indel in a later read may occur closer - * to its start and thus before a previously seen indel), and also because it is relatively difficult (TO_DO?)to break a - * pile in the middle immediately when it becomes clear that two adjacent indels might have been overlapped by a - * single read, but no such read ever surfaced, the collector is conservative at this stage and keeps - * accumulating the pile (and indel train) until it moves sufficiently far away from the last indel seen (full - * maximum read length is recommended). Then it switches back into wait state and performs post-processing - * of the indel train and the collected pile: only at this stage the preliminary pile is closely examined and if - * there are pairs of adjacent indels not spanned by any read, the pile is broken into smaller piles - * that conform to the contract outlined above. These piles are directed into the RecordEmitter, and the - * reads that fall in between the piles, if any (i.e. those that do not overlap with final indel trains - * determined at the post-processing stage) are relabeled as "not interesting" and redirected - * to the appropriate output channel. 
- * - * @author asivache - * - */ -public class IndelRecordPileCollector implements RecordReceiver { - - private final int WAIT_STATE = 0; - private final int ACTIVE_STATE = 1; - private int INDEL_COUNT_CUTOFF = 2; - - private boolean avoiding_region; // some regions are too funky (contain very long indel trains)- - // we will determine their span and report them, - // but we won't be counting any indels there or building piles - - private List mRecordPile; // here we keep the records before we decide how we want to emit them - private TreeSet > mAllIndels; ///< individual indels encountered, with observation counts - private int mLastContig ; ///< keeps the index of the contig last alignment was on - private int mLastStartOnRef; ///< keeps the start position of the last alignment - private int mState; ///< WAIT_STATE or ACTIVE_STATE - private int mIndelSeparation; ///< Indels that are farther away from one another than this value - ///< will be emitted separately; trains of indels with less then - ///< mIndelSeparation bases between each adjacent pair will be emitted - ///< as one pile. 
- - // we will build histograms (distributions) of encountered indel lengths on the fly - private List mIndelLengthHistI; - private List mIndelLengthHistD; - - private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest - private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there - - private boolean controlRun = false; - - public String memStatsString() { - String s = "mRecordPile: "; - return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef; - //+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop; - } - - public void setIndelCountAcceptanceCutoff(int n) { INDEL_COUNT_CUTOFF = n; } - - public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException { - mRecordPile = new LinkedList(); - mAllIndels = new TreeSet >( - new CountedObjectComparatorAdapter(new IntervalComparator())); - mLastContig = -1; - mLastStartOnRef = -1; - mIndelSeparation = 51; - mIndelLengthHistI = new ArrayList(); - mIndelLengthHistD = new ArrayList(); - for ( int i = 0 ; i < 5 ; i++ ) { - mIndelLengthHistI.add(0); - mIndelLengthHistD.add(0); - } - defaultReceiver = rr; - indelPileReceiver = rp; - setWaitState(); - } - - /** Fully reinitializes wait state: clears record pile and indel list, resets flags and states. - * Does not emit records, just clears/resets the variables. 
- */ - private void setWaitState() { - - mRecordPile.clear(); - mAllIndels.clear(); -// mIndelRegionStart = 1000000000; -// mIndelRegionStop = -1; - avoiding_region = false; - mState = WAIT_STATE; // got to do this if we were in avoid_region state - } - - public void setControlRun(boolean c) { controlRun = c; } - - - /** A utility method: emits into nonindelReceiver and purges from the currently held SAM record pile - * all the consequtive records with alignment end positions less than or equal to the specified - * position pos, until the first record is encountered that does not meet this condition. Note that - * there might be more alignments that end at or before pos later on in the pile, but - * they will nit be emitted/removed by this method. - * @param pos all leading records with alignments ending before or at this position will be purged from the pile, - * up to the first record that does not end at or before pos. - */ - protected void purgeRecordsEndingAtOrBefore(final long pos) { - Iterator i = mRecordPile.iterator(); - while ( i.hasNext() ) { - SAMRecord r = i.next(); - if ( r.getAlignmentEnd() <= pos ) { - defaultReceiver.receive(r); - i.remove(); - } else break; - } - } - - /** A utility method: purges from the currently held SAM record pile all the records with alignment - * start positions greater than or equal to the specified position pos - * @param pos all records with alignments starting at or after this position will be purged from the pile - */ - protected void purgeRecordsStartingAtOrAfter(final int pos) { - Iterator i = mRecordPile.iterator(); - while ( i.hasNext() ) { - SAMRecord r = i.next(); - if ( r.getAlignmentStart() >= pos ) { - defaultReceiver.receive(r); - i.remove(); - } else break; - } - } - - /** This method MUST be called when no more reads are left in order to enforce the collector to emit the current pile of reads - * it is still holding. 
- */ - public void close() { - emit(); - } - - /** This is the main interface method of the collector: it receives alignments, inspects them, detects indels, - * updates and purges the read pile it keeps and emits alignments as needed. - * Depending on the state, the following behaviors are possible - * - *
    - *
  • If the collector is in wait state (no indels seen recently): all - * alignments that end prior to the start of currently inspected alignment can not overlap - * with any future indels, including those that may be present in the current alignment; these records - * get purged from the pile and emitted immediately. Current alignment gets added to the pile. - * If current alignment has indels, collector switches into 'active' state. - *
  • in active state: if the current alignment starts sufficiently far away from the last indel seen, - * examine the currently held pile closely, split into a few separate piles/indel trains if needed, emit and - * completely purge the pile, add alignment to the pile, switch to wait state if alignment has no indels or - * stay in active state if it does. Otherwise (alignment too close to last indel), - * just add alignment to the pile, since it is yet impossible to tell whether new indels are coming soon and - * indel train will need to be extended; if alignment does have indels of its own, add them - * to the current indel train - *
- * - * This method checks that records arrive in reference-sorted order and throws RuntimeException if out-of-order - * record arrives. - * - * @param r - * @throws RuntimeException - */ - public void receive(final SAMRecord r) throws RuntimeException { - - if ( r.getReadUnmappedFlag() ) { - defaultReceiver.receive(r); // do not throw reads away even if they are of no use for us, keep them in the output bam.... - return; // read did not align, nothing to do - } - - if ( controlRun ) { - defaultReceiver.receive(r); - return; - } - - int currContig = r.getReferenceIndex(); - int currPos = r.getAlignmentStart(); - - if ( currContig < mLastContig ) throw new RuntimeException("SAM file is not ordered by contigs"); - if ( currContig == mLastContig && currPos < mLastStartOnRef ) throw new RuntimeException("SAM file is not ordered by start positions"); - - if ( currContig > mLastContig ) { - // we jumped onto a new contig; emit everything we might have been building and purge the piles: - emit(); - } else { // still on the same contig: - - switch (mState) { - // everything ending up to currPos is guaranteed to have no overlaps with indels yet to come - case WAIT_STATE: purgeRecordsEndingAtOrBefore(currPos); break; - - // next indel can start only after currPos (whether it is in the current read or in the - // reads yet to come). If it is far enough from the last indel we have seen, we can emit - case ACTIVE_STATE: if ( currPos - mAllIndels.last().getObject().getStop() > mIndelSeparation ) emit(); break; - default: throw new RuntimeException("Unknown state"); - } - } - - // does nothing if alignment has no indels, otherwise adds the indels to the list and (re)sets state to 'active' - extractIndelsAndUpdateState(r.getCigar(),currPos); - - if ( mState == ACTIVE_STATE && ( ! avoiding_region ) && ( mAllIndels.size() > 20 || mRecordPile.size() > 1000 ) ) { - avoiding_region = true; - } - - if ( ! 
avoiding_region ) mRecordPile.add(r); // add new record if this is not some crazy region - else defaultReceiver.receive(r); // if we do not want to or can not deal with a region, pass reads through; - // the pile we have already collected before discovering it's a bad region will be sent through on the next call to emit() - - mLastContig = currContig; - mLastStartOnRef = currPos; - - } - - /** Emits all reads from the currently held pile, cleans the pile and fully reinitializes wait state - * (clears indel list etc). - * - * If the current state is 'wait', simply sends all the records from the pile to nonindelReceiver before - * the cleanup. If the state is 'active', then performs final inspection of the pile built over a train of indels, - * splits the train (and the pile) into multiple trains/piles as needed (i.e. if there are pairs of adjacent - * indels that are not overlapped by any read), and emits the final piles of records into indelReceiver. - */ - private void emit() { - - if ( mState == WAIT_STATE || avoiding_region ) { - // System.out.println("Emitting uninteresting pile"); - if ( avoiding_region ) { - long start = mAllIndels.first().getObject().getStart(); - long stop = mAllIndels.last().getObject().getStop(); - System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop + - " was ignored: "); - System.out.println(" "+mAllIndels.size() +" unique indels with average distance of "+ - ((double)(stop - start))/((double)mAllIndels.size()-1) + - " bases between indels"); - System.out.println(" "+mRecordPile.size() +" reads collected before aborting"); - } - - // no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile - for ( SAMRecord r : mRecordPile ) { - defaultReceiver.receive(r); - } - setWaitState(); - return; - } - - // last minute cleanup: - // at this stage we have all the indels collected conservatively (in a sense - // that they can be farther away than it is needed) - this means that there 
actually - // can be more than one pile in what we have stored. Also, we can still have gapless reads - // at the ends of the piles that do not really overlap with indel sites. - - // System.out.println("Emitting pile with indels ("+mRecordPile.size()+" reads, "+mAllIndels.size()+" indels)"); - if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels"); - - HistogramAsNeeded(mAllIndels); - - - // indels are in a sorted map, and reads were added to the pile in the order they were received (also sorted). - // we will traverse the two collections in parallel and detect exactly where we can break the indel train into - // subtrains - Iterator > i_iter = mAllIndels.iterator(); - - // will keep list of indels and list of records, respectively, in one final train - List< CountedObject > finalTrain = new ArrayList>(); - List< SAMRecord > finalPile = new ArrayList(); - - long curr_stop = -1; // the rightmost stop position among all the alignments seen so far - - CountedObject indel = i_iter.next(); // we checked that list of indels contains at least one element! 
- - SAMRecord record ; - - while ( indel != null ) { - - // first, if we just started new indel train, then emit into defaultReceiver all alignments - // that end prior to the first indel in the train: - if ( finalTrain.size() == 0 ) purgeRecordsEndingAtOrBefore(indel.getObject().getStart() - 1); - - finalTrain.add(indel); - - Iterator r_iter = mRecordPile.iterator(); - - if ( r_iter.hasNext() ) record = r_iter.next(); - else record = null; - - // record now contains first alignment that ends in or after the indel, or null if there are no more records - - // now collect all the alignments that overlap with the current indel (start before or inside) and - // record the rightmost alignment stop position: - while ( record != null && record.getAlignmentStart() <= indel.getObject().getStop() ) { - finalPile.add(record); - r_iter.remove(); // remove from the original pile the record we just moved to the current final pile - curr_stop = Math.max(curr_stop, record.getAlignmentEnd()); - if ( r_iter.hasNext() ) record = r_iter.next(); - else record = null; - } - - // record is now the first alignment that starts after the indel, or null if there are no more records - - // we are done with current indel, get next one if any: - if ( i_iter.hasNext() ) { - indel = i_iter.next(); - } else indel = null; - if ( indel == null || curr_stop < indel.getObject().getStart() ) { - // if there are no more indels or - // all alignments that overlapped with the previous indel ended before the current indel started, - // this means that the current train and pile of reads overlapping with it are fully built - // and can be emitted - - if ( shouldAcceptForOutput(finalTrain ) ) { - System.out.print("SITE: " + mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" + - finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " + - finalTrain.size() + " indels; "); - System.out.print(finalPile.size() + " reads in the pile;") ; - System.out.println(formatRange(finalTrain)); - 
indelPileReceiver.receive(finalPile); - } else { - for ( SAMRecord r : finalPile ) { - defaultReceiver.receive(r); - } - } - finalPile.clear(); - finalTrain.clear(); - curr_stop = -1; - } // ELSE: otherwise we have reads that overlap with both previous and current indel, so we just continue - // with building the indel train - } - - // we may still have reads in the original pile that start after the last indel: - for ( SAMRecord r : mRecordPile ) { - defaultReceiver.receive(r); - } - - setWaitState(); - } - - - /** Looks for indels in the cigar and, if finds any, updates list of indels in the current train ans sets - * the state to 'active'. If cigar contains no indels, this method does not do anything (it does not - * set state back to 'wait' either!). If this method finds any indels in the cigar, it first tries to find them - * in the list of previously seen indels. If the indel was already seen before, its counter is updated (indels - * are stored in the list as counted objects), oherwise indel is added to the list with initial count of 1. - * - * @param c alignment cigar; if it contains no indels, nothing will be done - * @param start position, at which the alignment represented by cigar c starts on the reference - */ - private void extractIndelsAndUpdateState(final Cigar c, final int start) { - // - // firstpos,lastpos span of the indel will be interpreted as follows: - // any alignment that ends strictly before firstpos or starts strictly after lastpos - // on the *reference* (not inclusive!) does not overlap with an indel; in the case of - // insertion it will result in firstpos > lastpos! 
- // lastpos - // | firstpos - // | | - // v v - // ---------III----- Ref Insertion: bases I are not in the ref; any alignment that starts - // after lastpos or ends before firstpos *on the reference* - // is completely over the reference bases to the right or to - // the left, respectively, of the insertion site - // - // firstpos - // | lastpos - // | | - // v v - //------------------ Ref Deletion: any alignment that ends before firstpos or starts after lastpos - // -----DDD--- alignment on the reference does not overlap with the deletion - int runninglength = start; // position on the original reference; start = alignment start position - - if ( c.numCigarElements() == 1 ) return; // most of the reads have no indels, save a few cycles by returning early - - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - - final CigarElement ce = c.getCigarElement(i); - Indel indel = null; - - switch(ce.getOperator()) { - case I: indel = new Indel(runninglength, ce.getLength(), IndelType.I); break; - case D: indel = new Indel(runninglength, ce.getLength(), IndelType.D); - runninglength += ce.getLength(); - break; - case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string"); - } - - if ( indel == null ) continue; // element was not an indel, go grab next element... - - mState = ACTIVE_STATE; // this is redundant and will be executed unnecessarily many times, but it's cheap... - - CountedObject indelWithCount = new CountedObject(indel); - CountedObject found = mAllIndels.floor(indelWithCount); - - if ( indelWithCount.equals( found ) ) found.increment(); // we did find our indel, advance the counter - else mAllIndels.add(indelWithCount); // this is a new indel. Add it. 
- } // end for loop over all alignment cigar elements - - } // end extractIndels() method - - - - /** Counts the size of the passed argument into the appropriate size histogram - * - * @param indel size of this indel will be counted in - */ - private void addToSizeHistogram(Indel indel) { - // count this indel's size into the appropriate bin of the appropriate histogram - // (we count insertions and deletions separately), resizing the histogram array if needed: - List histogram; - if ( indel.getType() == Indel.IndelType.D ) { - histogram = mIndelLengthHistD; - } else if ( indel.getType() == Indel.IndelType.I ) { - histogram = mIndelLengthHistI; - } else { - throw new RuntimeException("Indel of unknown type"); - } - if( indel.getIndelLength() > histogram.size() ) { - for ( int j = histogram.size() ; j < indel.getIndelLength() ; j++ ) histogram.add(0); - histogram.set((int)indel.getIndelLength()-1, 1); // we are seeing this length for the first time, so count == 1 - } else { - int n = histogram.get((int)indel.getIndelLength()-1); - histogram.set((int)indel.getIndelLength()-1, n+1); - } - } - - /** Adds sizes of the indels from the list that pass some filters to the histograms - * - * @param indels collection of indels with counts - */ - private void HistogramAsNeeded(Collection> indels) { - for ( CountedObject o : indels ) { - if ( o.getCount() >= 2 ) addToSizeHistogram(o.getObject()); - } - } - - /** Returns true if an attempt should be made to clean alignments around the - * specified indel train; currently, indel run is acceptable - * if it contains at least one indel onbserved more than once. 
- * @param indels list of indels with counts to check for being acceptable - * @return true if the indel run has to be printed - */ - private boolean shouldAcceptForOutput(List> indels) { - for ( CountedObject o : indels ) { - if ( o.getCount() >= INDEL_COUNT_CUTOFF ) return true; - } - return false; - } - - private String formatRange(List> indels) { - StringBuffer b = new StringBuffer(); - StringBuffer all = new StringBuffer(); - - long min = 1000000000; - long max = 0; - - all.append("; passing indels:"); - for ( CountedObject o : indels ) { - if ( o.getCount() < 2 ) continue; - all.append(" "); - all.append(o.getObject().getIndelLength()); - if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength(); - if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength(); - } - if ( max == 0 ) return new String(); // no passinf indels, return empty string - - b.append(" passing min length: "); - b.append(min); - b.append("; passing max length: "); - b.append(max); - b.append(all); - return b.toString(); - } - - public void printLengthHistograms() { - if ( mIndelLengthHistD.size() < mIndelLengthHistI.size() ) { - for ( int i = mIndelLengthHistD.size(); i < mIndelLengthHistI.size(); i++ ) mIndelLengthHistD.add(0); - } - if ( mIndelLengthHistI.size() < mIndelLengthHistD.size() ) { - for ( int i = mIndelLengthHistI.size(); i < mIndelLengthHistD.size(); i++ ) mIndelLengthHistI.add(0); - } - System.out.println("length n_insertions n_deletions"); - for ( int i = 0 ; i < mIndelLengthHistD.size(); i++ ) { - System.out.println((i+1)+" "+mIndelLengthHistI.get(i)+" "+mIndelLengthHistD.get(i)); - } - } - - /** Returns true iff the SAM record (or, strictly speaking, its cigar) has at least one insertion or deletion - * - * @param r record to analyze - * @return true if cigar contains at least one I or D element, false otherwise - */ -// private boolean hasIndel(SAMRecord r) { -// Cigar c = r.getCigar(); -// for ( int i = 0 ; i < 
c.numCigarElements() ; i++ ) { -// CigarOperator co = c.getCigarElement(i).getOperator(); -// if ( co.equals(CigarOperator.I) || co.equals(CigarOperator.D) ) { -// // we got an indel! -// return true; -// } -// } -// return false; -// } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/IndexedSequence.java b/archive/java/src/org/broadinstitute/sting/oldindels/IndexedSequence.java deleted file mode 100755 index 02b83d100..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/IndexedSequence.java +++ /dev/null @@ -1,95 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import java.util.ArrayList; -import java.util.Hashtable; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.Map.Entry; - -public class IndexedSequence implements Iterable>> { - private Map > m_locs; - private String m_seq; - private int m_K; - - public IndexedSequence(String seq, int K) { - m_locs = new Hashtable >(); - m_seq = new String(seq); - m_K = K; - Iterator iter = new KmerCodeIterator(seq, K); - int offset = 0; - while ( iter.hasNext() ) { - Short k = iter.next(); - List offset_list = m_locs.get(k); - if ( offset_list == null ) { - offset_list = new ArrayList(); - m_locs.put(k,offset_list); - } - offset_list.add(offset++); - } - } - - public Iterator>> iterator() { - return m_locs.entrySet().iterator(); - } - - public int length() { return m_seq.length(); } - - public List getOffsets(short k) { return m_locs.get(k); } - - String getSequence() { return m_seq; } - - public int getK() { return m_K; } - - public static void testMe() { - String s = "ACCGTGCGGGCACCTGC"; - int K = 3; - IndexedSequence is = new IndexedSequence(s,K); - System.out.println("Sequence: "+ s); - System.out.print(" "); - for ( int i= 0 ; i < s.length() ; i++ ) { - if ( i % 10 == 0 ) System.out.print(i/10); - else System.out.print(' '); - } - System.out.println(); - System.out.print(" "); - for ( int i= 0 ; i < 
s.length() ; i++ ) System.out.print(i%10); - System.out.println(); - System.out.println(); - - System.out.println("Indexing with K="+K+":"); - - Set< Map.Entry > > data = is.m_locs.entrySet(); - for ( Map.Entry > e : data ) { -// System.out.print("("+e.getKey()+") "); - System.out.print(kmerToString(e.getKey().shortValue(),K)); - System.out.print("-->"); - for ( Integer offset : e.getValue() ) { - System.out.print(" "+offset.toString()); - } - System.out.println(); - } - } - - private static String kmerToString(short code, int K) { - StringBuffer b = new StringBuffer(K); - for ( int i = 0 ; i < K ; i++ ) { - char c='N'; - switch( code & 0x3 ) { - case 0 : c = 'A' ; break; - case 1 : c = 'C' ; break; - case 2 : c = 'G' ; break; - case 3 : c = 'T' ; break; - } - b.append(c); - code >>= 2; - } - return b.reverse().toString(); - } - - public static void main(String argv[]) { - testMe(); - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/Interval.java b/archive/java/src/org/broadinstitute/sting/oldindels/Interval.java deleted file mode 100644 index 717f17133..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/Interval.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.playground.utils; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Mar 19, 2009 - * Time: 12:03:39 PM - * To change this template use File | Settings | File Templates. - */ - -/** Abstraction of a closed interval [start,stop] - * - */ -public interface Interval { - /** Start position of the interval. - * - * @return for the interval [start,stop] - */ - public long getStart(); - - /** Sets start position of the interval. - * - * @param s start coordinate - */ - public void setStart(long s); - - /** End position of the interval. - * - * @return for the interval [start,stop] - */ - public long getStop(); - - /** Sets stop position of the interval. 
- * - * @param s stop coordinate - */ - public void setStop(long s); - - /** Length of the interval. There is currently no contract, an implementation may return negative length - * or a length inconsistent with getStop() - getStart() + 1 if it chooses so. - * - * @return a number representing the length of the interval according to specific implementation - */ - public long getLength(); - - /** Returns true if this interval overlaps with i as judjed by getStart() and getStop() positions of the - * two interval objects. - * @param i Another interval - * @return true iff intervals overlap - */ - public boolean overlapsP(org.broadinstitute.sting.playground.utils.Interval i); - - /** Returns true if this interval does not overlap with i as judjed by getStart() and getStop() positions of the - * two interval objects. - * @param i Another interval - * @return true iff intervals do not overlap - */ - public boolean disjointP(org.broadinstitute.sting.playground.utils.Interval i); -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/IntervalComparator.java b/archive/java/src/org/broadinstitute/sting/oldindels/IntervalComparator.java deleted file mode 100755 index 8e0d38bce..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/IntervalComparator.java +++ /dev/null @@ -1,22 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import org.broadinstitute.sting.playground.utils.Interval; - - -public class IntervalComparator implements java.util.Comparator { - public int compare(Interval r1, Interval r2) { - if ( r1.getStart() < r2.getStart() ) return -1; - if ( r1.getStart() == r2.getStart() ) { - if ( r1.getStop() < r2.getStop() ) return -1; - if ( r1.getStop() == r2.getStop() ) return 0; - } - return 1; - } - - @Override - public boolean equals(Object o) { - if ( o instanceof IntervalComparator) return true; - return false; - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/KmerCodeIterator.java 
b/archive/java/src/org/broadinstitute/sting/oldindels/KmerCodeIterator.java deleted file mode 100755 index 4bce27939..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/KmerCodeIterator.java +++ /dev/null @@ -1,92 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import java.util.Iterator; - -public class KmerCodeIterator implements Iterator { - private byte[] m_seq; - private int m_last_offset; // offset of the last base to be added to current code when the next() shift occurs - private short m_code; - private short m_mask; // used to mask out all bits in m_code except the lowest 2*K bits used for kmers - - public KmerCodeIterator(String s, int K) { - assert K <= 8 : "Currently only Kmers of length K <=8 are supported"; - assert K > 0 : "Kmer length must be positive"; - m_seq = s.getBytes(); - m_last_offset = K-1; - m_mask = 0; - for ( int i = 0 ; i < K ; i++ ) { - m_mask <<= 2; - m_mask |= 0x03; - } - if ( K <= m_seq.length ) m_code = kmerCode(m_seq, 0, m_last_offset ); - // m_code now contains first K-1 bases encoded (the last, K-th base will be added when next() is called - } - - @Override - public boolean hasNext() { - return m_last_offset < m_seq.length ; - } - - @Override - public Short next() { - m_code <<= 2; - m_code |= toBaseCode(m_seq[m_last_offset]); - m_code &= m_mask; - m_last_offset++; - return m_code; - } - - @Override - public void remove() { - // TODO Auto-generated method stub - - } - - /** - * Converts base letter (character passed as byte: 'A', 'C', 'T', or 'G', case-insensitive) into - * numerical code according to A=0, C=1, G=2, T=3 (i.e. two bits per nucleotide) - * @param b - * @return - */ - private byte toBaseCode(byte b) { - // the following transformation is performed in the lines below: - // A,a->0, C,c->1, G,g->3, T,t->2; (we rely on the ASCII codes here!) - b >>= 1; - b &= 0x03; - // to get conventional base codes (A=0, C=1, G=2, T=3), we need to flip 3 (G) and 2 (T). 
- // In order to do that we xor the lowest bit - // with the second one (the latter is 1 for 2 and 3, but 0 for 0 and 1, so A anC will not be affected) - b ^= (b >> 1); - return b; - } - - /** Returns compact code that uniquely represents nucleotide sequence specified - * as an array of character codes (bytes) such as {'A','C','G','G','T',...}. Case insensitive. - * Sequence can not be longer than 8 nucleotides. - * @param s Nucleotide sequence - * @return unique code - */ -// private short kmerCode(byte [] s) { return kmerCode(s, 0, s.length); } - - /** Returns compact code that uniquely represents nucleotide sequence found in the interval - * [start,stop) of the specified array of character codes (bytes) such as {'A','C','G','G','T',...}. Case insensitive. - * Sequence can not be longer than 8 nucleotides. - * @param s Nucleotide sequence - * @param start index of the start position of the sub-sequence to be encoded - * @param stop next position after the last element of the sub-sequence to be encoded - * @return unique code - */ - private short kmerCode(byte [] s, int start, int stop) { - assert start <= stop : "Start position of the subsequence can not be greater than the stop position"; - assert start >= 0 : "Negative subsequence start positions are not allowed"; - assert stop <= s.length : "Stop position of the subsequence can not extend beyond the sequence end"; - short code = 0; - for ( int i = start ; i < stop ; i++ ) { - code <<= 2; - code |= toBaseCode(s[i]); - } - return code; - } - - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/MSAColumn.java b/archive/java/src/org/broadinstitute/sting/oldindels/MSAColumn.java deleted file mode 100755 index f95ff3a23..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/MSAColumn.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import java.util.List; -import java.util.ArrayList; - -public class MSAColumn { - private List mBytes; - - public 
MSAColumn() { - mBytes = new ArrayList(); - } - - /** Adds specified byte to the end of the column */ - public void add(char b) throws IllegalArgumentException { - if ( b == 'A' || b == 'C' || b == 'T' || b == 'G' || b == '-' || b==' ' || b == '*' || b=='N') { - mBytes.add(b); - } else { - throw new IllegalArgumentException("Invalid base letter passed to MSAColumn"); - } - } - - /** Removes first element from the column */ - public void removeFirst() throws IndexOutOfBoundsException { - mBytes.remove(0); - } - - /** Removes value at the specified position from the column */ - public void remove (int index) throws IndexOutOfBoundsException { - mBytes.remove(index); - } - - public int size() { return mBytes.size(); } - - public Character charAt(int offset) { return mBytes.get(offset); } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/Matrix.java b/archive/java/src/org/broadinstitute/sting/oldindels/Matrix.java deleted file mode 100755 index ae85ddae0..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/Matrix.java +++ /dev/null @@ -1,37 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -public class Matrix { - private int nRows; - private int nCols; - private Object [][] data; - - /** Instantiates a generic matrix of objects with n rows and m cols. - * - * @param n number of rows - * @param m number of columns - */ - public Matrix(int n, int m) { - nRows = n; - nCols = m; - data = new Object[n][m]; - } - - /** Instantiates a square n x n matrix of objects. 
- * - * @param n size of the matrix - */ - public Matrix (int n) { - this(n,n); - } - - @SuppressWarnings("unchecked") - public T get(int i, int j) { - assert (i < nRows ) && (j < nCols) : "Matrix index is out of bounds"; - return (T) data[i][j]; - } - - public void set(int i, int j, T value) { - assert ( i < nRows ) && ( j < nCols ) : "Matrix index is out of bounds"; - data[i][j] = value; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/MultipleAlignment.java b/archive/java/src/org/broadinstitute/sting/oldindels/MultipleAlignment.java deleted file mode 100755 index fc9d635d7..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/MultipleAlignment.java +++ /dev/null @@ -1,337 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import java.util.*; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.Pair; - - -public class MultipleAlignment implements Iterable { - private static final int IMPOSSIBLE = 1000000000; - private Map index; // maps external id of the sequence onto its index in the pile - private List seqs; // sequences, in order they were added - private List ext_ids; // external ids of the sequences, in order they were added to the pile - private List alignment_offsets; // offset of seqs[i] w/respect to seqs[0] (i.e. in order the seqs were added) - private int best_mm; // mismatch count - private int next_mm; // next-best mismatch count - private ConsensusSequence consensus; - - public MultipleAlignment() { - index = new HashMap(); - seqs = new ArrayList(); - alignment_offsets = new ArrayList(); - ext_ids = new ArrayList(); - consensus = new ConsensusSequence(); // we use reference position 0, e.g. we hook onto the first read in the pile - } - - public void clear() { - seqs.clear(); - index.clear(); - alignment_offsets.clear(); - ext_ids.clear(); - } - - /** Adds single sequence with id set to i. 
Pile must be empty, or IllegalStateException will be thrown - * - * @param seq sequence to add - * @param i id of the sequence (can be use later to query the pile) - * @see #add(String,int,int) - */ - public void add( String seq, int i ) throws IllegalStateException { - if ( size() != 0 ) throw new IllegalStateException("Single sequence can be added to an empty pile only"); - add(seq,i,0); - } - - /** Adds single sequence with id set to i and places it at the specified offset wrt the first sequence - * in this pile (i.e. wrt reference position 0). - * - * @param seq sequence to add - * @param i id of the sequence (can be use later to query the pile) - * @see #add(String,int) - */ - public void add( String seq, int i, int offset ) throws IllegalStateException { - index.put(i,index.size()); - ext_ids.add(i); - seqs.add(seq); - alignment_offsets.add(offset); - consensus.addSequence(seq,offset); - } - - public void add( PairwiseAlignment a) { - if ( a.id1() == -1 || a.id2() == -1 ) throw new IllegalArgumentException("Attempt to add pairwise alignemnt with sequence ids not properly set"); - add(a,a.id1(),a.id2()); - } - - /** Adds pair of aligned sequences to the pile, with the external ids of the first and second sequences being i and j, - * respectively. Pairwise alignment can be always added to an empty pile. If the pile is non-empty, exactly - * one of the sequences held by the pair-wise alignment should be already in the pile; this sequence (and the - * pairwise alignment itself) will be used to stitch the other sequence to the pile. If either both or - * none of the specified ids are already in the pile, an IllegalStateException will be thrown. 
- * @param a - * @param i - * @param j - */ - public void add( PairwiseAlignment a, int i, int j ) throws IllegalStateException { - if ( seqs.size() == 0 ) { - add(a.getSequence1(),i,0); - add(a.getSequence2(),j,a.getBestOffset2wrt1()); - return; - } - - Integer first = index.get(i); - Integer second = index.get(j); - - if ( first != null && second != null ) { - throw new IllegalStateException("Attempt to add pairwise alignment for two sequences that are already in the pile"); - } - - if ( first == null && second == null ) { - throw new IllegalStateException("Attempt to add pairwise alignment for two sequences none of which is already in the pile"); - } - - if ( second == null ) add(a.getSequence2(),j, a.getBestOffset2wrt1() + alignment_offsets.get( first ) ); - else add(a.getSequence1(),i, -a.getBestOffset2wrt1() + alignment_offsets.get( second ) ); - } - - /** Adds another pile of aligned sequences to this pile, stitching them together using specified pairwise alignment - * p of the sequences with external ids i and j. One of the indices i, j must be in this pile, and the other in - * the pile being added, otherwise an IllegalArgumentException is thrown. Sequence id's i and j MUST be the ids - * of the first and second sequences in the pairwise alignment, in that order. Specified ids override - * ids, if any, set for the sequences in the pairwise alignment; it is not checked whether the specified and - * stored ids match. The piles can not overlap. - */ - public void add(MultipleAlignment a, PairwiseAlignment p, int i, int j) { - int off2; // offset of the first sequence in pile 'a' wrt the first sequence in this pile - if ( this.contains(i) ) { - if ( ! a.contains(j)) throw new IllegalArgumentException("Sequence is not in the pile"); - off2 = getOffsetById(i)+p.getBestOffset2wrt1()-a.getOffsetById(j); - } else { - if ( this.contains(j)) { - if ( ! 
a.contains(i)) throw new IllegalArgumentException("Sequence is not in the pile"); - off2 = getOffsetById(j)-p.getBestOffset2wrt1()-a.getOffsetById(i); - } else throw new IllegalArgumentException("Sequence is not in the pile"); - } - // stitch sequences from a into this pile: - for ( Integer id : a ) { - if ( this.contains(id) ) throw new IllegalArgumentException("Attempt to add a pile that shares sequences with the current one"); - add(a.getSequenceById(id),id,off2+a.getOffsetById(id)); - } - } - - - /** Adds another pile of aligned sequences (a) to this pile, stitching them together using specified - * pairwise alignment p. Sequence ids must be set in the pairwise alignment, and one of those ids - * must be in this pile, and the other in the pile 'a' being added, otherwise an IllegalArgumentException - * is thrown. If pairwise alignment does not have sequence ids set, IllegalArgumentException is thrown. - * The piles can not overlap. - */ - public void add(MultipleAlignment a, PairwiseAlignment p) { - if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set"); - add(a,p,p.id1(),p.id2()); - } - - /** Returns sequence associated with the specified external id, or null if sequence with this external id is - * not found in the pile - * - * @param id query id - * @return sequence for specified id or null - */ - public String getSequenceById(int id) { - if ( ! contains(id)) return null; - return seqs.get(index.get(id)); - } - - /** Returns offset relative to the first sequence in the pile for sequence associated with the specified - * external id. If sequence with specified id is not found in the pile, RuntimeException is thrown. - * - * @param id query id - * @return offset for sequence with specified id - */ - public int getOffsetById(int id) { - if ( ! 
contains(id) ) throw new RuntimeException("Specified id is not in the pile"); - return alignment_offsets.get(index.get(id)); - } - - /** Returns external id of the read the offsets of this multiple alignment are based upon (i.e. all the offsets - * are specified wrt the base read). - * @return - */ - public int getBaseReadId() { return ext_ids.get(0); } - - /** Returns offset of the read specified by its external id wrt the start of the consensus sequence in this - * multiple alignment (consenus sequence is a major vote union of all the reads in this alignment). - * @param id - * @return - */ - public int getOffsetWrtConsensus(int id) { - return getOffsetById (id)- consensus.getStartOffset(); - } - - /** Returns true if the alignment already contains sequence with the specified id. - * - * @param id - * @return - */ - public boolean contains(int id) { - return index.containsKey(id); - } - - /** Returns number of mismatches between sequences i and j (external ids) in the currently held multiple alignment. - * Will return 0 if sequences do not overlap. Will throw RuntimeException if any of the specified ids is not - * found in the current pile. - * @param i id of the first sequence - * @param j id of the second sequence - * @return mismatch count - * - * */ - public int countMismatches(int i, int j) { - return PairwiseAlignment.countMismatches(getSequenceById(i), getSequenceById(j), getOffsetById(j)-getOffsetById(i)); - } - - /** Returns the length of the overlapping region of the two sequences specified by their external ids i and j. - * - * @return overlap size - */ - public int getOverlap(int i, int j) { - if ( ! contains(i) || ! 
contains(j) ) throw new RuntimeException("Sequence with specified id is not in MSA pile"); - int off = getOffsetById(j) - getOffsetById(i); - int L; - if ( off >= 0 ) L = Math.min(getSequenceById(i).length()-off, getSequenceById(j).length()); - else L = Math.min(getSequenceById(j).length()+off, getSequenceById(i).length()); - return ( L < 0 ? 0 : L ); - } - - /** Given the two sequence ids, one of which has to be already in the pile, returns the one that is not in the pile. - * - * @param i sequence id - * @param j sequence id - * @return one of the input arguments that is not found in the pile - * @throws IllegalArgumentException when either both or none of the specified indices are in the pile - */ - public int selectExternal(int i, int j) { - if ( contains(i) ) { - if ( contains(j) ) throw new IllegalArgumentException("Can not select external when both indices are in the pile"); - return j; - } else { - if ( ! contains(j) ) throw new IllegalArgumentException("Attempt to select external when both indices are not in the pile"); - return i; - } - } - - /** Returns a string consisting of n spaces. - * - * @param n - * @return - */ - private String skipN(int n) { - StringBuilder b=new StringBuilder(); - for ( int k = 0 ; k < n ; k++ ) b.append(' '); - return b.toString(); - } - - /** Prints n spaces directly into the specified string builder. 
- * - * @param n - * @param b - */ - private void skipN(int n, StringBuilder b) { - for ( int k = 0 ; k < n ; k++ ) b.append(' '); - } - - /** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately - * shifted and ready for printout; - */ - public String toString(boolean inorder, boolean dotprint) { - - StringBuilder b = new StringBuilder(); - java.util.Formatter frmt = new java.util.Formatter(b); - - if ( seqs.size() == 0 ) return b.toString(); - - final int first_offset = -consensus.getStartOffset(); - - final int msa_length = consensus.length(); - char[][] consensusString = new char[4][msa_length]; - - for ( int i = 0 ; i < msa_length ; i++ ) { - - Pair base = consensus.baseWithCountAt(i-first_offset); - consensusString[3][i] = base.first; - int mm = consensus.coverageAt(i-first_offset) - base.second; - if ( mm > 0 ) { - consensusString[2][i] = '*'; - if ( mm > 9 ) consensusString[0][i] = Character.forDigit(mm/10,10); - else consensusString[0][i] = ' '; - consensusString[1][i] = Character.forDigit(mm%10,10); - } else { - consensusString[0][i] = consensusString[1][i] = consensusString[2][i] = ' '; - } - } - - b.append(" "); b.append(consensusString[0]); b.append('\n'); - b.append(" "); b.append(consensusString[1]); b.append('\n'); - b.append(" "); b.append(consensusString[2]); b.append('\n'); - b.append(" "); b.append(consensusString[3]); b.append('\n'); - - Integer[] perm = null; - if ( inorder ) perm = Utils.SortPermutation(alignment_offsets); - - for ( int i = 0 ; i < seqs.size() ; i++ ) { - int index = (inorder ? 
perm[i] : i); - frmt.format("%3d:", ext_ids.get(index)); - int pos = alignment_offsets.get(index)+ first_offset; // start position on the consensus sequence - skipN(pos,b); - String aSeq = seqs.get(index); - if ( dotprint ) { - for ( int j = 0 ; j < aSeq.length() ; j++, pos++ ) { - if ( Character.toUpperCase(aSeq.charAt(j)) == - Character.toUpperCase(consensusString[3][pos]) ) b.append('.'); - else b.append(aSeq.charAt(j)); - } - } else b.append(aSeq); - b.append('\n'); - } -// b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n"); - return b.toString(); - } - - public String getConsensus() { - return consensus.getSequence(); - } - - public String toString() { return toString(true, false); } - - public int size() { return seqs.size(); } - - /** Returns an iterator over the id's of the sequences currently stored in the pile - * - * @return - */ - public Iterator sequenceIdIterator() { return index.keySet().iterator(); } - - /** Returns an iterator over external seuqnce ids of the sequences stored in the pile, presenting them in - * the order of ascending alignment offsets. 
- * @return - */ - public Iterator sequenceIdByOffsetIterator() { - final Integer[] perm = Utils.SortPermutation(alignment_offsets); - return new Iterator() { - private int i = 0; - public boolean hasNext() { - return i < perm.length; - } - public Integer next() { - return ext_ids.get(perm[i++]); - } - public void remove() { - throw new UnsupportedOperationException("remove not supported"); - } - } ; - - } - - public Iterator iterator() { - return sequenceIdIterator(); - } - - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/PairwiseAlignment.java b/archive/java/src/org/broadinstitute/sting/oldindels/PairwiseAlignment.java deleted file mode 100755 index 23bc37005..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/PairwiseAlignment.java +++ /dev/null @@ -1,216 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -public class PairwiseAlignment { - private static final int IMPOSSIBLE = 1000000000; - private String s1; - private String s2; - private int i1; // (external) id of the first sequence - private int i2; // (external) id of the second sequence - private int alignment_offset; // offset of s2 w/respect to s1 - private int best_mm; // mismatch count - private int next_mm; // next-best mismatch count - - /** Initializes the alignment with pair of sequences (that will be immediately aligned) and - * stores their specified external ids id1, id2. 
- * @param is1 first nucleotide sequence (pre-indexed) - * @param is2 second nucleotide sequence (pre-indexed) - * @param id1 external id of the first sequence - * @param id2 external id of the second sequence - */ - public PairwiseAlignment(IndexedSequence is1, IndexedSequence is2, int id1, int id2 ) { - s1 = new String(is1.getSequence()); - s2 = new String(is2.getSequence()); - i1 = id1; - i2 = id2; - best_mm = IMPOSSIBLE; - next_mm = IMPOSSIBLE; - align(is1,is2); - } - - /** Initializes the alignment with pair of sequences (that will be immediately aligned) and - * sets their external ids to -1. Such un-annotated pairwise alignment can not be added to MultipleAlignment. - * - */ - public PairwiseAlignment(IndexedSequence is1, IndexedSequence is2) { - this(is1,is2,-1,-1); - } - - /** - * Returns offset of sequence 2 with respect to sequence 1 in the best alignment - * @return positive offset if s2 is shifted right (starts later) wrt s1, or negative offset - * if s2 is shifted left (starts earlier) wrt s1 - */ - public int getBestOffset2wrt1() { return alignment_offset; } - - /** Returns offset of the sequence j wrt sequence i in the best pairwise alignment found. 
- * - * @param i extrenal id of a sequence, must be one of the sequences kept by this alignment - * @param j extrenal id of a sequence, must be one of the sequences kept by this alignment - * @return offset of 2nd arg (j) wrt to the first arg (i) - */ - public int getBestOffset2wrt1(int i, int j ) { - if ( i == i1 && j == i2 ) return alignment_offset; - else if ( i == i2 && j == i1 ) return -alignment_offset; - throw new RuntimeException("Specified sequence id not found in the alignment"); - } - - public String getSequence1() { return s1; } - public String getSequence2() { return s2; } - public String getSequenceById(int i) { - if ( i == i1 ) return s1; - else if ( i == i2 ) return s2; - throw new RuntimeException("Specified sequence id not found in the alignment"); - } - public int id1() { return i1;} - public int id2() { return i2;} - - /** Returns mismatch count in the best alignment found. - * - * @return count of mismatches or impossibly large number of no mismatches were found - */ - public int getBestMMCount() { return best_mm; } - - /** Returns the number of mismatches in the next-best alignment found - * - * @return next-best count of mismatches or impossibly large number if at most one alignment - * was ever found (that one would make the best then) - */ - public int getNextBestMMCount() { return next_mm; } - - /** Returns the length of the overlapping region of sequences s1 and s2 in the best alignment found, or -1 if - * sequences do not align. - * - * @return overlap size; can not be smaller than the size of the kmer used in IndexedSequence arguments the - * alignment was built from - */ - public int getOverlap() { - if ( ! 
alignmentExists() ) return -1; - if ( alignment_offset >= 0 ) { - return Math.min(s1.length()-alignment_offset, s2.length()); - } else { - return Math.min(s2.length()+alignment_offset, s1.length()); - } - } - - public static int getOverlap(String seq1, String seq2, int offset2wrt1) { - int L ; - if ( offset2wrt1 >= 0 ) { - L = Math.min(seq1.length()-offset2wrt1, seq2.length()); - } else { - L = Math.min(seq2.length()+offset2wrt1, seq1.length()); - } - return ( L < 0 ? 0 : L ); - } - - /** Returns true if at least one alignment, no matter how bad, was found between the two sequences - * (i.e. the sequences have at least one kmer in common). - */ - public boolean alignmentExists() { return best_mm < IMPOSSIBLE; } - - public void align(IndexedSequence is1, IndexedSequence is2) { - - Set offsets = new HashSet() ; // possible offsets of s2 wrt s1 as suggested by matching kmers - for ( Map.Entry> e : is1 ) { // for each kmer in s1 - List kmer_offsets_2 = is2.getOffsets(e.getKey()); - if ( kmer_offsets_2 == null ) continue; // uh-oh, kmer is not found in the other sequence - for ( Integer i1 : e.getValue() ) { - for ( Integer i2 : kmer_offsets_2 ) { - offsets.add(i1-i2); // offset of seq 2 wrt seq1 as suggested by the currently inspected occurences of the same kmer e.getKey() in both sequences - } - } - } - // we have now a collection of distinct s1-s2 offsets seeded by matching kmers. 
- // lets extend these kmer matches and count mismatches: - - for ( Integer trial_offset : offsets ) { - int mm_cnt = countMismatches(is1.getSequence(), is2.getSequence(), trial_offset,next_mm+1); -// if ( (i1==4||i1==8) && i2==18) { -// if ( i1== 18 ) System.out.print("to " + i2+" : "); -// else System.out.print("to " + i1+" : "); -// System.out.println("offset="+trial_offset.toString()+ -// "; mm=" + countMismatches(is1.getSequence(),is2.getSequence(),trial_offset)+ -// "(mm_cnt="+mm_cnt+")"+ -// "; dist="+distance(is1.getSequence(),is2.getSequence(),trial_offset)+ -// "; overlap="+getOverlap(is1.getSequence(),is2.getSequence(),trial_offset)); -// } - // save current offset if alignment at this offset has fewer mismatches tham everything we've - // seen so far, or if it has same number of mismatches but has larger overlap (i.e. distance - // between sequences is smaller) - if ( mm_cnt < best_mm || - ( ( mm_cnt == best_mm ) && - getOverlap(is1.getSequence(),is2.getSequence(),alignment_offset) < - 0.8*getOverlap(is1.getSequence(),is2.getSequence(),trial_offset) ) ) { -// if ( (i1==4||i1==8) && i2==18) System.out.println("Saved offset "+trial_offset.toString()); - alignment_offset = trial_offset; - next_mm = best_mm; - best_mm = mm_cnt; - } else { - if ( mm_cnt < next_mm ) next_mm = mm_cnt; - } - } - } - - public static int countMismatches(String seq1, String seq2, int offset2wrt1) { - int pos1 = ( offset2wrt1 >= 0 ? offset2wrt1 : 0 ); - int pos2 = ( offset2wrt1 >= 0 ? 0 : -offset2wrt1 ); - int cnt = 0; - while ( pos1 < seq1.length() && pos2 < seq2.length() ) { - if ( Character.toUpperCase(seq1.charAt(pos1++)) == - Character.toUpperCase(seq2.charAt(pos2++)) ) continue; - cnt++; // found mismatch - } - return cnt; - } - - public static int countMismatches(String seq1, String seq2, int offset2wrt1, int maxerr) { - int pos1 = ( offset2wrt1 >= 0 ? offset2wrt1 : 0 ); - int pos2 = ( offset2wrt1 >= 0 ? 
0 : -offset2wrt1 ); - int cnt = 0; - while ( pos1 < seq1.length() && pos2 < seq2.length() && cnt < maxerr ) { - if ( Character.toUpperCase(seq1.charAt(pos1++)) == - Character.toUpperCase(seq2.charAt(pos2++)) ) continue; - cnt++; // found mismatch - } - return cnt; - } - - /** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately - * shifted and ready for printout; the pairwise alignment is followed by a stats line - */ - public String toString() { - StringBuffer b = new StringBuffer(); - int skip1 = ( alignment_offset >= 0 ? 0 : -alignment_offset ); - int skip2 = ( alignment_offset >=0 ? alignment_offset : 0 ); - for ( int k = 0 ; k < skip1 ; k++ ) b.append(' '); - b.append(s1); - b.append('\n'); - for ( int k = 0 ; k < skip2 ; k++ ) b.append(' '); - b.append(s2); - b.append('\n'); - b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n"); - return b.toString(); - } - - public double distance() { - int L = getOverlap(); - if ( L <=0 ) return 1e100; - double l = ( best_mm==0? 1.0 : (double)best_mm + Math.sqrt((double)best_mm) ); - return ( l / (double)L ); - } - - public static double distance(String seq1, String seq2, int offset2wrt1) { - int L = getOverlap(seq1,seq2,offset2wrt1); - if ( L <= 0 ) return 1e100; - int mm = countMismatches(seq1,seq2,offset2wrt1); - double l = ( mm == 0 ? 
1.0 : (double)mm + Math.sqrt((double)mm) ); - return ( l / (double) L ); - } - -} - - diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/PassThroughWriter.java b/archive/java/src/org/broadinstitute/sting/oldindels/PassThroughWriter.java deleted file mode 100644 index 98cdf403e..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/PassThroughWriter.java +++ /dev/null @@ -1,41 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; -import net.sf.samtools.SAMFileHeader; - -import java.io.File; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Mar 25, 2009 - * Time: 8:27:09 PM - * To change this template use File | Settings | File Templates. - */ -public class PassThroughWriter implements RecordReceiver { - private SAMFileWriter writer; - private int reads_written = 0; - - public PassThroughWriter( File f, SAMFileHeader h) { - writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(h, false, f); - } - - public PassThroughWriter(String s, SAMFileHeader h) { - this(new File(s), h); - } - - public void receive(SAMRecord r) { - //To change body of implemented methods use File | Settings | File Templates. - writer.addAlignment(r); - reads_written++; - } - - public void close() { writer.close() ; } - - /** Returns the number of reads that were so far received by this writer. 
- * - */ - public int getNumReadsReceived() { return reads_written; } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/PileBuilder.java b/archive/java/src/org/broadinstitute/sting/oldindels/PileBuilder.java deleted file mode 100755 index c8647628b..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/PileBuilder.java +++ /dev/null @@ -1,1005 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import net.sf.samtools.*; - -import java.util.*; -import java.io.File; - -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.playground.utils.CountedObject; -import org.broadinstitute.sting.playground.utils.CountedObjectComparatorAdapter; - - -public class PileBuilder implements RecordPileReceiver { - private SymmetricMatrix distances ; - private Matrix alignments ; - private static final int KmerSize = 8; - private MultipleAlignment alignments1; - private MultipleAlignment alignments2; - private String referenceSequence; - private int reference_start; - - private int processed_piles = 0; - private int improved_piles = 0; - private int unmodified_piles = 0; - private int failed_piles = 0; - private int indels_improved = 0; - private int indel_improvement_cnt = 0; - private int indels_discarded = 0; - private int indels_added = 0; - private int indels_added_cnt = 0; - private int total_mismatches_count_in_improved = 0; - private int total_mismatches_count_in_failed = 0; - private int total_improved_mismatches_count = 0; - private int total_reads_in_improved = 0; - private int total_reads_in_failed = 0; - private int total_alignments_modified = 0; - - private int total_reads_received = 0; - private int total_reads_written = 0; - - public final static int SILENT = 0; - public final static int PILESUMMARY = 1; - public final static int ALIGNMENTS = 2; - - private int mVerbosityLevel = SILENT; - - private SAMFileWriter samWriter; - private RecordReceiver failedPileReceiver; - - private static class SelectedPair { 
- private int i_; - private int j_; - private double d_; - - private SelectedPair(int i, int j, double d) { - set(i,j,d); - } - - private SelectedPair() { - set(-1,-1,1e100); - } - - private double d() { return d_; } - private int i() { return i_; } - private int j() { return j_; } - - private void set(int i, int j, double d) { - i_ = i; - j_ = j; - d_ = d; - } - - /** Returns true if any of the two indices kept by this pair is equal to i. - * - * @param i - * @return - */ - private boolean contains(int i) { - return ( ( i_ == i ) || ( j_ == i ) ); - } - - } - - public class SelectedSequence { - private int id_; - private double d_; - - private SelectedSequence(int i, double d) { - set(i,d); - } - - private SelectedSequence() { this(-1,1e100) ; } - private void set(int i, double d) { id_ = i; d_ = d; } - - public double d() { return d_;} - public int i() { return id_; } - - } - - public PileBuilder(File f, SAMFileHeader h, RecordReceiver fr) { - samWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(h,false,f); - referenceSequence = null; - reference_start = -1; - failedPileReceiver = fr; - } - - public PileBuilder(String s, SAMFileHeader h, RecordReceiver fr) { - this(new File(s),h, fr); - } - - public void setReferenceSequence(String seq, int start) { - referenceSequence = seq; - reference_start = start; - } - - public void setReferenceSequence(String seq) { - referenceSequence = seq; - reference_start = -1; - } - - /** Returns the number of reads that were so far received by this writer. - * - */ - public int getNumReadsReceived() { return total_reads_received; } - - /** Returns the number of reads that were so far written by this writer (NOT sent - * into its secondary "failed mode" receiver!) 
- * - */ - public int getNumReadsWritten() { return total_reads_written; } - - public void receive(Collection c) { - int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref - int stopOnRef = 0; // absolute stop (rightmost) position of the pile of reads on the ref (rightmost alignment end) - for ( SAMRecord r : c ) { - startOnRef = Math.min(startOnRef, r.getAlignmentStart() ); - stopOnRef = Math.max(stopOnRef,r.getAlignmentEnd()); - } - - // part of the reference covered by original alignments: - String pileRef = referenceSequence.substring(startOnRef-1,stopOnRef); - receive(c, pileRef, startOnRef); - } - - public void receive(Collection c, String pileRef, int startOnRef) { - - //TODO: if read starts/ends with an indel (insertion, actually), we detect this as a "different" indel introduced during cleanup. - processed_piles++; - total_reads_received += c.size(); - - IndexedSequence[] seqs = new IndexedSequence[c.size()]; - int i = 0; - for ( SAMRecord r : c ) { - seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize); - } - - int totalMismatches = 0; // total mismatches across all reads - TreeSet< CountedObject > all_indels = new TreeSet< CountedObject >( - new CountedObjectComparatorAdapter(new IntervalComparator())); - - SequencePile originalAligns = null; - if ( mVerbosityLevel >= ALIGNMENTS ) originalAligns = new SequencePile(pileRef); - - for ( SAMRecord r : c ) { - if ( mVerbosityLevel >= ALIGNMENTS ) { - originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), - r.getCigar(), r.getAlignmentStart() - startOnRef ); - } - totalMismatches += AlignmentUtils.numMismatches(r,referenceSequence); - //AlignmentUtils.collectAndCountIndels(r,all_indels); - } - - if ( mVerbosityLevel >= ALIGNMENTS ) { - System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"); - System.out.println("ORIGINAL ALIGNMENT: \n"); - originalAligns.dotprint(true); - 
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ; - } - - List piles = doMultipleAlignment2(seqs); - - if ( piles.size() > 2 ) { - System.out.println("WARNING: " + piles.size() + " piles appear to be disjoint"); - } - -// System.out.print("Distance between final piles: "+distance(alignments1, alignments2)); -// System.out.print("; diameter of PILE1: "+ diameter(alignments1)); -// System.out.println("; diameter of PILE2: "+ diameter(alignments2)); - - SymmetricMatrix d = new SymmetricMatrix(piles.size()); - for ( int n = 0 ; n < piles.size() ; n++ ) { - d.set(n,n,diameter(piles.get(n))); - for ( int m = n+1 ; m < piles.size() ; m++ ) { - d.set(n,m,distance(piles.get(n), piles.get(m))); - } - } - - int new_mismatches = 0 ; // number of mismatches after re-alignment: - TreeSet< CountedObject > new_indels = new TreeSet< CountedObject >( - new CountedObjectComparatorAdapter(new IntervalComparator()) - ); // new indels after realignment - int shifted_reads = 0; - int smashed_reads = 0; - - List as_list = (List)c; // ugly hack; need this to access records by ids - - if ( mVerbosityLevel >= PILESUMMARY ) System.out.println(d.format("%8.4g")); - - for ( int n = 0 ; n < piles.size() ; n++ ) { -// SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),2.0,-10.0,-2.0,-1.0); - SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4.0,-0.5); - - if ( mVerbosityLevel >= ALIGNMENTS ) { - - System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):"); - System.out.println(consToRef.toString()); - System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString(true,true)); - } -// SequencePile pileAligns = new SequencePile(pileRef); - - MultipleAlignment ma = piles.get(n); - for ( Integer id : ma ) { - SAMRecord r = as_list.get(id); - int cons_offset = ma.getOffsetWrtConsensus(id); // offset of 
the read 'id' wrt multiple alignment's full consensus seq - -/* - System.out.println("id=" + id +": offset on consensus="+cons_offset+ - "; consensus wrt ref chunk="+consToRef.getAlignmentStart2wrt1()+"; chunk start="+startOnRef); -*/ - - int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1()+indelCorrection(cons_offset,consToRef.getCigar()); - if ( ref_offset != r.getAlignmentStart()) shifted_reads++; - Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar()); -/* - if ( id == 9 ) { - System.out.println("ref_offset="+ref_offset+"; orig_ref_off="+r.getAlignmentStart()+"; "+ - AlignmentUtils.toString(cig)); - } - - System.out.println("adding "+id+" at "+ (ref_offset - refStarttemp)); - pileAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), cig, ref_offset - refStarttemp); -*/ - if ( cig.numCigarElements() != r.getCigar().numCigarElements() ) smashed_reads++; - - if ( ref_offset != r.getAlignmentStart() || cig.numCigarElements() != r.getCigar().numCigarElements() ) total_alignments_modified++; - - SAMRecord rtest = new SAMRecord(r.getHeader()); - rtest.setAlignmentStart(ref_offset); - rtest.setReadString(r.getReadString()); - rtest.setReadUmappedFlag(r.getReadUnmappedFlag()); - rtest.setCigar(cig); - //AlignmentUtils.collectAndCountIndels(rtest,new_indels); - new_mismatches += AlignmentUtils.numMismatches(rtest,referenceSequence); - } - // pileAligns.colorprint(true); - } - - boolean pile_improved = false; - boolean pile_unmodified = false; - boolean pile_failed = false; - - double mmChangePct = Math.abs((new_mismatches - totalMismatches)*100.0/totalMismatches); - - if ( shifted_reads == 0 && smashed_reads == 0 ) pile_unmodified = true; - else { - if ( new_mismatches < totalMismatches || - mmChangePct < 10.0 && ( new_indels.size() < all_indels.size() ) - ) pile_improved = true; - else pile_failed = true; - } - - if ( pile_improved ) { - total_mismatches_count_in_improved +=totalMismatches; - 
total_improved_mismatches_count += new_mismatches; - total_reads_in_improved += c.size() ; - } - - if ( pile_failed ) { - total_mismatches_count_in_failed += totalMismatches; - total_reads_in_failed += c.size(); - } - int discovered_indels = 0; - int discovered_support = 0; - int existing_indels = 0; - int existing_support = 0; - int existing_support_new = 0; - int discarded_indels = 0; - for ( CountedObject ind : new_indels ) { - //System.out.print("new indel: "+ind.getObject().getStart()+"+"+ind.getObject().getStop()); - if ( ! all_indels.contains(ind) ) { - //System.out.println(" (DISCOVERED)"); - discovered_indels++; - discovered_support += ind.getCount(); - if ( pile_improved ) { - indels_added++; - indels_added_cnt += ind.getCount(); - } - } else { - //System.out.println(" (EXISTING)"); - existing_indels++; - existing_support_new += ind.getCount(); - if ( pile_improved && ( ind.getCount() > all_indels.floor(ind).getCount() ) ) { - if ( ! ind.equals(all_indels.floor(ind))) System.out.println("ERROR MATCHING INDELS!!!") ; - indels_improved++; - indel_improvement_cnt += ( ind.getCount() - all_indels.floor(ind).getCount() ); - } - } - } - for ( CountedObject ind : all_indels ) { - //System.out.print("old indel: "+ind.getObject().getStart()+"+"+ind.getObject().getStop()); - if ( ! 
new_indels.contains(ind )) { - //System.out.println(" (DISCARDED)"); - discarded_indels++; - if ( pile_improved ) indels_discarded++; - } else { - //System.out.println(" (KEPT)"); - existing_support += ind.getCount(); - } - } - - if ( pile_improved ) improved_piles++; - if ( pile_unmodified ) unmodified_piles++; - if ( pile_failed ) failed_piles++; - - if ( mVerbosityLevel >= PILESUMMARY ) { - System.out.print("TOTAL MISMATCHES: "+totalMismatches +" --> "+new_mismatches); - if ( totalMismatches > new_mismatches ) System.out.print("(-"); - else System.out.print("(+"); - System.out.printf("%.2f%%)%n",mmChangePct); - - System.out.println("CONFIRMED INDELS: "+existing_indels); - System.out.print("CONFIRMED INDEL SUPPORT: "+existing_support + " --> " + existing_support_new ); - if ( existing_support > existing_support_new ) System.out.print("(-"); - else System.out.print("(+"); - System.out.printf("%.2f%%)%n",Math.abs((existing_support- existing_support_new)*100.0/existing_support)); - System.out.println("DROPPED INDELS: " + discarded_indels); - System.out.println("DISCOVERED INDELS: " + discovered_indels) ; - System.out.println("DISCOVERED INDELS SUPPORT: "+discovered_support); - System.out.println("ALIGNMENTS SHIFTED: "+shifted_reads); - System.out.println("ALIGNMENTS WITH GAPS CHANGED: "+smashed_reads); - - if ( pile_improved ) System.out.println("OUTCOME: IMPROVED"); - if ( pile_unmodified ) System.out.println("OUTCOME: UNCHANGED"); - if ( pile_failed ) System.out.println("OUTCOME: FAILED"); - - System.out.println("\n#############################################################################\n"); - } - // finally, writing stuff: - for ( int n = 0 ; n < piles.size() ; n++ ) { - - SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4.0,-0.5); - MultipleAlignment ma = piles.get(n); - - Iterator id_iter = ma.sequenceIdByOffsetIterator(); - while ( id_iter.hasNext() ) { - - int id = id_iter.next(); - - SAMRecord r = 
as_list.get(id); - if ( pile_failed ) { - failedPileReceiver.receive(r); // nothing to do, send failed piles directly for writing - continue; - } - - if ( pile_unmodified ) { - samWriter.addAlignment(r); // nothing to do, "cleaned" pile has not changed, so we just write all reads back - continue; - } - - // we improved stuff!! let's reset the alignment parameters! - int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq - - // offset of the realigned read r on the reference - int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1()+indelCorrection(cons_offset,consToRef.getCigar()); - - r.setAlignmentStart(ref_offset); - - Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar()); - - r.setCigar(cig); - - r.setAttribute("NM",new Integer(AlignmentUtils.numMismatches(r,referenceSequence))); - - // System.out.println("writing " + id); - samWriter.addAlignment(r); - total_reads_written++; - - } - } - - } - - public void close() { samWriter.close(); } - - public double pct (int i, int t) { - return ((double)i*100.0/((double)t)); - } - - public void printStats() { - System.out.println("\n---------------------------------------------------------------------------------"); - System.out.println("Piles processed: "+ processed_piles); - System.out.printf("Piles improved: %d (%.2f%%)%n", improved_piles,pct(improved_piles,processed_piles)); - System.out.printf("Piles confirmed (unchanged): %d (%.2f%%)%n", unmodified_piles,pct(unmodified_piles,processed_piles)); - System.out.printf("Piles failed: %d (%.2f%%)%n", failed_piles,pct(failed_piles,processed_piles)); - System.out.println("In improved piles:"); - System.out.printf(" Total reads: %d (%.1f per pile) with %.2f mm/read originally%n", total_reads_in_improved, - (double)total_reads_in_improved/(double)improved_piles,(double) total_mismatches_count_in_improved /(double)total_reads_in_improved); - System.out.printf(" Overall 
mismatch count: %d --> %d (%.2f%%)%n", total_mismatches_count_in_improved,total_improved_mismatches_count, - pct(total_improved_mismatches_count- total_mismatches_count_in_improved, total_mismatches_count_in_improved)); - System.out.printf(" Mismatch improvement: suppressed %.2f mm/read%n", - (double)(total_mismatches_count_in_improved -total_improved_mismatches_count)/(double)total_reads_in_improved ); - System.out.printf(" Alignments modified: %d (%.2f%% of total or %.2f per pile)%n",total_alignments_modified, - pct(total_alignments_modified,total_reads_in_improved),(double)total_alignments_modified/(double)improved_piles); - System.out.printf(" Improved indels: %d (%.2f per pile) with %.3f additional reads per indel%n", - indels_improved,(double)indels_improved/(double)improved_piles,(double)indel_improvement_cnt/(double)indels_improved); - System.out.printf(" New indels: %d (%.2f per pile) with %.3f reads per indel%n", - indels_added,(double)indels_added/(double)improved_piles,(double)indels_added_cnt/(double)indels_added); - System.out.printf(" Discarded indels: %d (%.2f per pile)%n", - indels_discarded,(double)indels_discarded/(double)improved_piles); - System.out.println("In failed piles:"); - System.out.printf(" Total reads: %d (%.1f per pile) with %.2f mm/read originally%n", total_reads_in_failed, - (double)total_reads_in_failed/(double)failed_piles,(double) total_mismatches_count_in_failed /(double)total_reads_in_failed); - System.out.println("---------------------------------------------------------------------------------\n"); - - } - - public void setVerbosity(int v) { - mVerbosityLevel = v; - } - /** Assuming that a read of length l has a gapless, fully consumed align starting at s (ZERO-based) to some sequence X, - * and that sequence's alignment to some reference Y is described by baseCigar, builds a cigar for the direct - * alignment of the read to Y (i.e. 
if the alignment of X to Y contains indel(s) and the read spans them, the - * indels will be inserted into the new cigar for read-Y alignment). - * @param s - * @param l - * @param baseCigar - * @return - */ - private Cigar buildCigar(int s, int l, Cigar baseCigar) { - - int refpos = 0; - - List lce = new ArrayList(5); // enough to keep 2 indels. should cover 99.999% of cases... - - CigarElement celem = null; - int i = 0; - while ( refpos <= s ) { - celem = baseCigar.getCigarElement(i); - if ( celem.getOperator() != CigarOperator.D ) refpos+=celem.getLength(); - i++; - } - // we now sit on cigar element that contains start s, and refpos points to the end of that element; i points to next element - - lce.add( new CigarElement(Math.min(refpos-s,l),celem.getOperator()) ); - - while ( refpos < s+l ) { - celem = baseCigar.getCigarElement(i); - // System.out.print("ref="+refpos+",s+l="+(s+l)+"len="+celem.getLength()+":"); - lce.add( new CigarElement(Math.min(celem.getLength(),l + s - refpos), celem.getOperator()) ); - if ( celem.getOperator() != CigarOperator.D ) refpos += celem.getLength(); - i++; - } - return new Cigar(lce); - } - - private int indelCorrection(int offset, Cigar cig) { - int correction = 0; - for ( int i = 0 ; i < cig.numCigarElements() && offset > 0 ; i++ ) { - CigarElement ce = cig.getCigarElement(i); - switch ( ce.getOperator() ) { - case M: offset -= ce.getLength() ; break; - case I: - if ( offset >= ce.getLength() ) correction-= ce.getLength(); - else correction -= offset; - offset -= ce.getLength(); - break; - case D: correction+=ce.getLength(); - break; - } - } - return correction; - } - - public void initPairwiseAlignments( IndexedSequence [] seqs ) { - distances = new SymmetricMatrix( seqs.length ); - alignments = new Matrix( seqs.length ); - for( int i = 0; i < seqs.length ; i++ ) { - for ( int j = i+1 ; j < seqs.length ; j++ ) { - PairwiseAlignment a = new PairwiseAlignment(seqs[i],seqs[j],i,j); // compute pairwise alignment - 
alignments.set(i, j, a); // save it - alignments.set(j, i, a); // save it - distances.set(i,j,a.distance()); - } - } - } - - /** Finds the best pairwise alignment across all available ones. The object must be initialized first, - * so that the alignments are pre-computed. - * @return id's of the two sequences and the distance between them in a compound object. - */ - public SelectedPair findClosestPair() { - - SelectedPair p = new SelectedPair(-1,-1,1e100); - - for( int i = 0; i < distances.size() ; i++ ) { - for ( int j = i+1 ; j < distances.size() ; j++ ) { - double d = distances.get(i,j); - if ( d < p.d() ) p.set(i,j,d); - } - } - return p; - } - - /** Finds the worst pairwise alignment across all available ones. The object must be initialized first, - * so that the alignments are pre-computed. - * @return id's of the two sequences and the distance between them in a compound object. - */ - public SelectedPair findWorst() { - - SelectedPair p = new SelectedPair(-1,-1,-1.0); - - for( int i = 0; i < distances.size() ; i++ ) { - for ( int j = i+1 ; j < distances.size() ; j++ ) { - double d = distances.get(i,j); - if ( d > p.d() ) p.set(i,j,d); - } - } - return p; - } - - - /** Finds the best pairwise alignment across all available ones, subject to the constraint that neither - * of the two sequences found can be listed (by its id) in the supplied SelectedPair object. If the best pair is passed - * as an argument, this method will find the next best pair. - * - * @param pexclude neither of the two sequences in the returned pair can have its id listed in pexclude pair. 
- * @return Best pairwise alignment excluding alignments between pairs involving at least one sequence from pexclude - */ - public SelectedPair findNextClosestPairAfter(SelectedPair pexclude) { - - SelectedPair p = new SelectedPair(-1,-1,1e100); - - for( int i = 0; i < distances.size() ; i++ ) { - if ( pexclude.contains(i) ) continue; - for ( int j = i+1 ; j < distances.size() ; j++ ) { - if ( pexclude.contains(j)) continue; - double d = distances.get(i,j); - if ( d < p.d() ) p.set(i,j,d); - } - } - return p; - } - - /** Finds the closest sequence to the specified pile among all sequences, which are not yet in that pile. Being - * the 'closest' is defined in terms of minimum distance. - * - * @param a alignment pile to find the closest sequence for - * @return a compound SelectedPair object that contains the index of the closest sequence found (is guaranteed to - * be not in the pile), the index of the sequence in the pile it is closest to, and the actual distance between the two. - */ - public SelectedPair findClosestToPile(MultipleAlignment a) { - - SelectedPair p = new SelectedPair(-1,-1,1e100); - - for ( Integer id : a ) { - for (int i = 0; i < distances.size(); i++) { - if (a.contains(i)) continue; // a already contains both sequences (i,id) - double d = distances.get(i, id); - if (d < p.d() ) p.set(i,id,d); - } - } - return p; - } - - public SelectedPair findClosestToPileAverage(MultipleAlignment a) { - - SelectedPair p = new SelectedPair(-1,-1,1e100); - - // currently, we compute the average distance from each sequence to the pile, but if the average - // distance is small enough, we will try to stitch that sequence to the pile based on the *best* - // available pairwise alignment, best_id will keep the id of that sequence from the pile that - // has the best alignment with the sequence that is the closest on average - int best_id=-1; - - Set offsets = new HashSet(); // all putative offsets suggested by different p-wise alignments - for ( int i = 0 ; i < 
distances.size() ; i++ ) { // for each sequence i - - if ( a.contains(i) ) continue; // sequence i is already in the pile, ignore it - - offsets.clear(); - - for ( Integer id : a ) { // for all sequences from the msa pile - PairwiseAlignment pa = alignments.get(i,id); - if ( pa.getOverlap() <= 0 ) continue; // at this step we do not take into account sequences with no overlap - // alignment pa suggests this offset of i wrt the first sequence in the msa - offsets.add( pa.getBestOffset2wrt1(id,i)+a.getOffsetById(id)); - } - // we got all suggested offsets; now lets recompute distances: - - for( Integer off : offsets ) { - SelectedPair spo = averageDistanceForOffset(a,i,off); - if ( spo.d() < p.d() ) p.set(spo.i(),spo.j(),spo.d()); - } - } - return p; - } - - public Matrix averageClosestDistanceMatrix(List la, int n) { - - Matrix mp = new Matrix(n); - - for ( int i = 0 ; i < n ; i++ ) { - for ( int j = i + 1 ; j < n ; j++ ) { - mp.set(i,j, findBestAlignment(la.get(i),la.get(j)) ); - mp.set(j,i, mp.get(i,j) ); - } - } - return mp; - } - - public SelectedPair findBestAlignment(MultipleAlignment a1, MultipleAlignment a2) { - - Map all_offsets = new HashMap(); - SelectedPair p = new SelectedPair(-1,-1,1e100); - - for ( Integer id1 : a1 ) { - for ( Integer id2 : a2 ) { - PairwiseAlignment pa = alignments.get(id1,id2); - if ( pa.getOverlap() <= 0 ) continue; // id1 and id2 do not overlap and/or we don't have p-wise alignment - - // record suggested offset of a2 wrt a1 (defined by their first sequences), and remember the - // pairwise alignment that suggested it - int suggested_offset = a1.getOffsetById(id1) + pa.getBestOffset2wrt1(id1,id2) - a2.getOffsetById(id2); - if ( ! 
all_offsets.containsKey(suggested_offset) ) { - all_offsets.put( suggested_offset , new PrimitivePair.Int(id1,id2)) ; - } - } - } - for ( Map.Entry offset_record : all_offsets.entrySet() ) { - - double d = averageDistanceForOffset(a1,a2,offset_record.getKey()); - if ( d < p.d() ) p.set(offset_record.getValue().first,offset_record.getValue().second,d); - } - return p; - } - - public double averageDistanceForOffset(MultipleAlignment a1, MultipleAlignment a2, int offset) { - - double d_av = 0; - int nseq = 0; - - for ( Integer id2 : a2 ) { - SelectedPair spo = averageDistanceForOffset(a1,id2,offset+a2.getOffsetById(id2)); - if ( spo.d() > 1e99 ) continue; - nseq++; - d_av += spo.d(); - } - if ( nseq == 0 ) return 1e100; - d_av /= nseq; - return d_av; - } - - /** Computes average distance from sequence i to multiple alignment a for the specified offset of 'i' wrt 'a' - * and returns that distance and pair of sequence indices, on which the specified offset is realized - * @param a - * @param i - * @param offset - * @return - */ - public SelectedPair averageDistanceForOffset(MultipleAlignment a, int i, int offset) { - - SelectedPair p = new SelectedPair(-1,-1,1e100); - - double d = 0; // will hold average distance - double dmin = 1e100; // used to find the nearest individual sequence in the pile - int nseq = 0; // number of sequences in the pile that have distance to sequence i defined - int best_id = -1; - - for ( Integer id : a ) { // for all sequences from the msa pile - PairwiseAlignment pa = alignments.get(i,id); - int new_off = offset - a.getOffsetById(id); // offset of i wrt id as suggested by - double dist_for_off; // distance between i and id for the given offset off - - // check if p-wise alignment has data for the specified offset: - boolean canuse = false; - if ( pa.alignmentExists() && pa.getBestOffset2wrt1(id,i) == new_off ) { - dist_for_off = distances.get(i,id); - canuse = true; // can use this alignment to stitch i to a - } - else { - // offset is 
different from what the pwise alignment suggests; recompute! - dist_for_off = PairwiseAlignment.distance(pa.getSequenceById(id),pa.getSequenceById(i),new_off); - } - if ( dist_for_off > 1e99 ) continue; // at this offset, i and id do not overlap, go check next id - d += dist_for_off; - nseq++; - if ( dist_for_off < dmin && canuse ) { - dmin = dist_for_off; - best_id = id; - } - - } - if ( nseq == 0 ) return p; - d /= (double)nseq; - p.set(i,best_id,d); - return p; - } - - /** Finds, among all sequences, the one farthest from the specified pile. Being - * the 'farthest' is defined as having the largest lower bound of the distances to all sequences in the pile. - * - * @param a alignment pile to find the closest sequence for - * @return index of the farthest sequence - */ - public int findFarthestFromPile(MultipleAlignment a) { - - double dmaxmin = 0; - int i_out = -1; - - for ( int i = 0 ; i < distances.size() ; i++ ) { - - if ( a.contains(i) ) continue; - double d_min = 1e100; // smallest distance from sequence i to the pile - - for ( Integer id : a ) { - double d = distances.get(i, id) ; - if (d < d_min ) d_min = d; - } - // d_min is the smallest distance from sequence i to pile a - if ( d_min > dmaxmin ) { - // sequence i is so far the farthest... - dmaxmin = d_min; - i_out = i; - } - } - return i_out; - } - - public double distance(MultipleAlignment a1, MultipleAlignment a2) { - double d = 1e100; - for ( Integer id1 : a1 ) { - for ( Integer id2 : a2 ) { - if ( distances.get(id1,id2) < d ) d = distances.get(id1,id2); - } - } - return d; - } - - /** Computes the distances from each sequence in the pile to its closest - * neighbor (within the same pile), and returns the greatest among these distances. - * In other words, no sequence in the pile is farther than diameter() from its closest neighbor. 
- * @param a alignment pile to compute diameter for - * @return the greatest distance from a sequence to its closest neighbor within the pile - */ - public double diameter(MultipleAlignment a) { - double dmaxmin = 0.0; - if ( mVerbosityLevel >= PILESUMMARY ) System.out.print("\nclosest neighbor for each seq: ["); - Iterator ids1 = a.sequenceIdByOffsetIterator(); - while ( ids1.hasNext() ) { - Integer id1 = ids1.next(); - double d = 1e100; // will hold distance from id1 to its closest neighbor - for ( Integer id2 : a ) { - if ( id2 == id1 ) continue; - double dpair = distances.get(id1,id2) ; - d = Math.min(d,dpair); - } - // d = distance from id1 to its closest neighbor within the pile - if ( d < 1e99 && mVerbosityLevel >= PILESUMMARY ) System.out.printf("%8.4g",d); - if ( d < 1e99 && d > dmaxmin ) dmaxmin = d; - } - if ( mVerbosityLevel >= PILESUMMARY ) System.out.println(" ]"); - // dmaxmin = the largest distance from a sequence in this pile to its closest neighbor -// System.out.println(); - return dmaxmin; - } - - public static void main(String argv[]) { - - int K=8; -// IndexedSequence [] seqs = testSet1(K); // initialize test set data -// IndexedSequence [] seqs = testSet2(K); // initialize test set data -// IndexedSequence [] seqs = testSet3(K); // initialize test set data - IndexedSequence [] seqs = testSet4(K); // initialize test set data - - PileBuilder pb = new PileBuilder("test1.bam",null,new DiscardingReceiver()); - - //pb.doMultipleAlignment(seqs); - pb.doMultipleAlignment2(seqs); - System.out.print("Distance between final piles: "+pb.distance(pb.alignments1, pb.alignments2)); - System.out.print("; diameter of PILE1: "+ pb.diameter(pb.alignments1)); - System.out.println("; diameter of PILE2: "+ pb.diameter(pb.alignments2)); - - System.out.println("PILE 1: \n"+pb.alignments1.toString()); - System.out.println("PILE 2: \n"+pb.alignments2.toString()); - } - - public void doMultipleAlignment(IndexedSequence[] seqs) { - // two piles we are going to grow 
until all sequences are assigned to one of them. - // we intend to keep the piles disjoint, e.g. no sequence should be placed in both - - - MultipleAlignment pile1 = new MultipleAlignment(); - MultipleAlignment pile2 = new MultipleAlignment(); - - initPairwiseAlignments(seqs); - - - // all the pairwise alignments are computed and disjoint best and next-best pairs are found - -// System.out.println( distances.format("%8.4g ")); - - - SelectedPair pworst = findWorst(); - - pile1.add(seqs[pworst.i()].getSequence(), pworst.i()); - pile2.add(seqs[pworst.j()].getSequence(), pworst.j()); - - - // initialize piles with best and next-best pairs -/* - SelectedPair p_best = findClosestPair(); - SelectedPair p_nextbest = findNextClosestPairAfter(p_best); - pile1.add( alignments.get(p_best.i(), p_best.j())); - pile2.add( alignments.get(p_nextbest.i(), p_nextbest.j())); -*/ -/* - System.out.println("Best pair ("+p_best.i() + "," + p_best.j()+", d="+p_best.d()+"):"); - System.out.println(pile1.toString()); - System.out.println("Next best pair ("+p_nextbest.i() + "," + p_nextbest.j()+", d="+p_nextbest.d()+ "):"); - System.out.println(pile2.toString()); -*/ - SelectedPair p1 = null; - SelectedPair p2 = null; - - // grow piles hierarchical clustering-style - while ( pile1.size() + pile2.size() < seqs.length ) { - // find candidate sequences closest to each of the two piles - -// p1 = findClosestToPileAverage(pile1); // findClosestToPile(pile1); -// p2 = findClosestToPileAverage(pile2); //findClosestToPile(pile2); - p1 = findClosestToPile(pile1); // findClosestToPile(pile1); - p2 = findClosestToPile(pile2); //findClosestToPile(pile2); - int id1_cand = pile1.selectExternal(p1.i(), p1.j()); // id of the sequence closest to the pile 1 - int id2_cand = pile2.selectExternal(p2.i(), p2.j()); // id of the sequence closest to the pile 2 - if ( pile2.contains(id1_cand) && pile1.contains(id2_cand)) { - // pile1 and pile 2 are mutually the closest, so we need to merge them. 
- // if piles are mutually the closest, then p1 and p2 are the same pair (id1, id2), - // so we just merge on one of the (redundant) instances: - pile1.add(pile2, alignments.get( p1.i(), p1.j())); - pile2.clear(); // need to reset pile 2 to something else - int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1 - pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence - } else { - if ( p1.d() < p2.d() ) { - if ( pile2.contains(id1_cand) ) { - pile1.add(pile2, alignments.get( p1.i(), p1.j())); - pile2.clear(); // need to reset pile 2 to something else - int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1 - pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence - } else pile1.add( alignments.get(p1.i(), p1.j()) ); - } else { - if ( pile1.contains(id2_cand) ) { - pile2.add(pile1, alignments.get( p2.i(), p2.j())); - pile1.clear(); // need to reset pile 2 to something else - int z = findFarthestFromPile(pile2); // get sequence farthest from merged pile 1 - pile1.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence - } else pile2.add( alignments.get(p2.i(), p2.j()) ); - } - } - System.out.println("PILE 1: \n"+pile1.toString()); - System.out.println("PILE 2: \n"+pile2.toString()); - } // end WHILE - - alignments1 = pile1; - alignments2 = pile2; -/* - * System.out.println("Closest distance to the pile: " + best_d - + "(adding: " + best_i + "," + best_j + "):"); - System.out.println(pile.toString()); - } -*/ - } - -public List doMultipleAlignment2(IndexedSequence[] seqs) { - - initPairwiseAlignments(seqs); - - List piles = new LinkedList(); - - int npiles = seqs.length; - - for ( int i = 0 ; i < seqs.length ; i++ ) { - MultipleAlignment m = new MultipleAlignment(); - m.add(seqs[i].getSequence(),i); - piles.add(m); - } - - while ( npiles > 2 ) { - Matrix dist = averageClosestDistanceMatrix(piles,npiles); - int best_i = -1; - int best_j = -1; - 
int pile_i = -1; - int pile_j = -1; - double d = 1e100; - for ( int i = 0 ; i < npiles ; i++ ) { - for ( int j = i+1 ; j < npiles ; j++ ) { - SelectedPair p = dist.get(i,j); - if ( p.d() < d ) { - d = p.d(); - pile_i = i; - pile_j = j; - best_i = p.i(); - best_j = p.j(); - } - } - } - - if ( d >= 1e99 ) break; // oops, we could not stitch any of the remaining piles together! - - // got the closest pair - piles.get(pile_i).add(piles.get(pile_j),alignments.get(best_i,best_j)); - // System.out.println("JOINED PILE: \n"+piles.get(pile_i).toString()); - piles.remove(pile_j); - npiles--; - } - -// alignments1 = piles.get(0); -// alignments2 = piles.get(1); - - -// System.out.println("PILE 1: \n"+piles.get(0).toString()); -// System.out.println("PILE 2: \n"+piles.get(1).toString()); - return piles; -} - - public static IndexedSequence[] testSet1(int K) { - IndexedSequence [] seqs = new IndexedSequence[9]; - seqs[0] = new IndexedSequence("CAAAAAAAGCAAAACTCTGAAGAAAGAGAGAGAGAGGGAGAGAGGGAGAGAGAAAGGGAGAGACGATGAGAGACAG",K); - seqs[1] = new IndexedSequence("GCAAAACTCTGAAGAAAGAGAGAGAGAGGGAGAGAGGGAGAGAGAAAGGAAGAGACGAT",K); - seqs[2] = new IndexedSequence("AACTCTGAAGAAAGAGAGAGAGAGGGAGAGAGGGAGAGAGAAAGGAAGAGACGATGAGA",K); - seqs[3] = new IndexedSequence("GAGAGGGAGAGAGAAAGGAAGAGACGATGAGAGACAGAGAAGGAGAGAGAAAGTACAAAAGAACGAATGAACGAAC",K); - seqs[4] = new IndexedSequence("ACGATGAGAGACAGAGAAGGAGAGAGAAAGTACAAAAGAACGAATGAACGAACAAACTAGAAATCGAGCAGGAAAA",K); - seqs[5] = new IndexedSequence("GAGAGACAGAGAAGGAGAGAGAAAGTACAAAAGAACGAATGAACGAACAAACTAGAAATCGAGCAGGAACCTTGGA",K); - seqs[6] = new IndexedSequence("TGAGACAGAGAAGGAGAGAGAAAGTACAAAAGAACGAATGAACGAACAAACTAGAAATC",K); - seqs[7] = new IndexedSequence("AGACAGAGAAGGAGAGAGAAAGTACAAAAGAACGAATGAACGAACAAACTAGAAATCGAGCAGGAACCTTGGAGGA",K); - seqs[8] = new IndexedSequence("AGACAGAGAAGGAGAGAGAAAGTACAAAAGAACGAATGAACGAACAAACTAGAAATCGAGCAGGAACCTTGGAGGA",K); - return seqs; - } - - public static IndexedSequence[] testSet2(int K) { - IndexedSequence [] seqs = 
new IndexedSequence[11]; - seqs[0] = new IndexedSequence("TGCAATGAGATGAGATCGTGCCTCTGCACTCCAGCCTGGGCGACAGAGTGAGAGACCCTGTCTCAAAAACACAAAA",K); - seqs[1] = new IndexedSequence("AATGAGATGAGATCGTGCCTCTGCACTCCAGCCTGGGCGACAGAGTGAGAGACCCTGTCTCAAAAACACAAAAACA",K); - seqs[2] = new IndexedSequence("CCTCTGCACTCCAGCCTGGGCGACAGAGTGAGAGACCCTGTCTCAAAAACACAAAAACAACAACAACAAAAAAACA",K); - seqs[3] = new IndexedSequence("CAGAGTGAGAGACCCTGTCTCAAAAACACAAAAACAACAACAACAAAAAAACACCAATCTGAGCAAATACTGCCCT",K); - seqs[4] = new IndexedSequence("CAGAGTGAGAGACCCTGTCTCAAAAACACAAAAACAACAACAACAAAAAAACACCAATCTGAGCAAATACTGCCCT",K); - seqs[5] = new IndexedSequence("GAGACCCTGTCTCAAAAACACAAAAACAACAACAACAAAAAAACACCAATCTGAGCAAATACTGCCCTAAACCGAG",K); - seqs[6] = new IndexedSequence("CCCTGTCTCAAAAACACAAAAACAACAACAACAAAAAAACACCAATCTGAGCAAATACTGCCCTAAACCGAGTGTT",K); - seqs[7] = new IndexedSequence("CCAAAAACAACAACAACAAAAAAACACCAATCTGAGCAAATACTGCCCTAAACCGAGTG",K); - seqs[8] = new IndexedSequence("CAAAAACAACAACAACAAAAAAACACCAATCTGAGCAAATACTGCCCTAAACCGAGTGTTGTTATCTCTGGGGAGT",K); - seqs[9] = new IndexedSequence("AACAACAACAACAAAAAAACACCAATCTGAGCAAATACTGCCCTAAACCGAGTGTTGTTATCTCTGGGTAGTTTGG",K); - seqs[10] = new IndexedSequence("ACAACAACAACAAAAAAACACCAATCTGAGCAAATACTGCCCTAAACCGAGTGTTGTTATCTCTGGGTAGCTTGGA",K); - return seqs; - } - - public static IndexedSequence[] testSet3(int K) { - IndexedSequence [] seqs = new IndexedSequence[11]; - seqs[0] = new IndexedSequence("TGGAAATTTATTTCTCAGAGTACTGGAAGCTGGGAATCCAAGATCAAAATGCCAGCAGATTCTAAGTCTGGTGAGG",K); - seqs[1] = new IndexedSequence("TGGAAATTTATTTCTCAAAGTACTGGAAGCTGGGAATCCAAGATCAAAATGCCAGCAGATTCTAAGTCTGGTGAGG",K); - seqs[2] = new IndexedSequence("GGAAATTTATTTCTCAGAGTACTGGAAGCTGGGAATCCAAGATCAAAATGCCAGCAGATTCTAAGTCTGGTGAGGG",K); - seqs[3] = new IndexedSequence("GGAAATTTATTTCACAGAGTAATGGAAGCTGGGAATCCAAGATCAAAATGCCAGCAGCTTCTAAGTCTGCTGAGGG",K); - seqs[4] = new IndexedSequence("ATTTCTCAGAGTACTGGAAGCTGGGAATCCAAGATCGAAATGCCAGCAGATTCTAAGTC",K); - seqs[5] = new 
IndexedSequence("ATTTCTCAGAGTACTGGAAGCTGGGACTCCAAGATCAAAATGCCAGCAGATTCTAAGTCTGGTGAGGGTAGGGTGC",K); - seqs[6] = new IndexedSequence("GTACTGGAAGCTGGGAATCCAAGATCAAAATGCCAGCAGATTCTAAGTCTGGTGAGGGTAGGGTGCACTCTCTGCT",K); - seqs[7] = new IndexedSequence("AATCCAAGATCAAAATGCCAGCAGATTCTAAGTCTGGTGAGGGTAGGGTGCACTCTCTGCTTCATAAATGGGTCTC",K); - seqs[8] = new IndexedSequence("CAAGATCAAAATGCCAGCAGATTCTAAGTCTGGTGAGGGTAGGGCGCACTCTCTGCTTCATAAATGGGTCTCTTGC",K); - seqs[9] = new IndexedSequence("ATCAAAATGCCAGCAGATTCTAAGTCTGGTGAGGGTAGGGTGCACTCTCTGCTTCATAAATGGGTCTCTTGCCGCA",K); - seqs[10] = new IndexedSequence("GTCTGGTGAGGGTAGGGTGCACTCTCTGCTTCATAAATGGGTCTCTTGCCGCAAAAAAATCTGTTTGCTCCTCCAG",K); - return seqs; - } - - public static IndexedSequence[] testSet4(int K) { - IndexedSequence [] seqs = new IndexedSequence[19]; - seqs[0] = new IndexedSequence("CGTGTGTGTGTGTGCAGTGCGTGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTTTGTGAGATC",K); - seqs[1] = new IndexedSequence("ATGTGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCAT",K); - seqs[2] = new IndexedSequence("GTGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGC",K); - seqs[3] = new IndexedSequence("TGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCA",K); - seqs[4] = new IndexedSequence("GTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCAT",K); - seqs[5] = new IndexedSequence("GTGTGTGTGCCGTGCTTTGTGCTGTGAGATCTGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCTGCAT",K); - seqs[6] = new IndexedSequence("GTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGT",K); - seqs[7] = new IndexedSequence("GTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGT",K); - seqs[8] = new IndexedSequence("TGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTG",K); - seqs[9] = new IndexedSequence("AGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGT",K); - seqs[10] = new 
IndexedSequence("TGGGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGCAGCGCATGGTGCTGTGTGAGATCAGCGTGTGTGTGTGCAG",K); - seqs[11] = new IndexedSequence("GCTGTGAGATCAGCGTGTGTGTGTGAGCAGTGCATGGGGATGTGTGAGATCAGCATGTGTGTGTGTGTGCAGCGCG",K); - seqs[12] = new IndexedSequence("GCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCA",K); - seqs[13] = new IndexedSequence("AGATCAGCATGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCATGGTGC",K); - seqs[14] = new IndexedSequence("AGATCAGCGTGTGTGTGTGCAGCGCATGGCGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCGGCGCATGGGGGTG",K); - seqs[15] = new IndexedSequence("GATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGAATGTGTGTGTGTGTGCAGTGCATGGTGCT",K); - seqs[16] = new IndexedSequence("ATCAGCATGGGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGGGTGTGTGGGGTGGGTGGTGGTG",K); - seqs[17] = new IndexedSequence("ATCAGCATGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCATGGGGCTG",K); - seqs[18] = new IndexedSequence("GTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCATGGTGCTGAGTGTG",K); - return seqs; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/RecordPileReceiver.java b/archive/java/src/org/broadinstitute/sting/oldindels/RecordPileReceiver.java deleted file mode 100644 index 3aab323e3..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/RecordPileReceiver.java +++ /dev/null @@ -1,22 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import net.sf.samtools.SAMRecord; - -import java.util.Collection; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Mar 19, 2009 - * Time: 7:51:47 PM - * To change this template use File | Settings | File Templates. - */ - -/** This interface abstracts processing of piles (collections) of SAM records. - * Its only receive() method should be called to send a collection of records - * to the implementation. 
- */ - -public interface RecordPileReceiver { - public void receive(Collection c) ; -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/RecordReceiver.java b/archive/java/src/org/broadinstitute/sting/oldindels/RecordReceiver.java deleted file mode 100644 index 389b0c43f..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/RecordReceiver.java +++ /dev/null @@ -1,18 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import net.sf.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Mar 19, 2009 - * Time: 7:28:40 PM - * To change this template use File | Settings | File Templates. - */ - -/** This interface abstracts processing of SAM records. Its only receive() method should be called to send a record - * to the implementation. - */ -public interface RecordReceiver { - public void receive(SAMRecord r); -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/SequencePile.java b/archive/java/src/org/broadinstitute/sting/oldindels/SequencePile.java deleted file mode 100755 index cf0c9b320..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/SequencePile.java +++ /dev/null @@ -1,275 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import java.util.List; -import java.util.ArrayList; -import net.sf.samtools.*; - -public class SequencePile { - private List mSeqGrid; - private StringBuilder mRefGrid; - private StringBuilder headerGrid; - private int mDepth; - private List mSeqRC; - - - public SequencePile(String ref) { - mRefGrid = new StringBuilder( ref ); - headerGrid = new StringBuilder(); - for ( int i = 0; i < ref.length(); i++ ) headerGrid.append(' '); - mSeqGrid = new ArrayList(); - for ( int i = 0 ; i < mRefGrid.length(); i++ ) { - mSeqGrid.add(new MSAColumn()); - } - mDepth = 0; - mSeqRC = new ArrayList(); - } - - /** Adds to the pile nucleotide sequence that aligns at zero-based position - relative to the original reference stretch the pile is 
built upon; the detailed alignment - of the sequence to that reference stretch is specified by the . - - @param seq nucleotide sequence - @param isRC true indicates that RC of the is being aligned - @param cigar specification of the alignment of the sequence to the reference - @param refpos 0-based position of the alignment with respect to the original stretch of the reference - that was passed to the pile's constructor. Either or +sequence_length can be outside of - the pile's boundaries, the SequencePile class will deal with such situations correctly. - */ - public void addAlignedSequence(String seq, boolean isRC, Cigar cigar, int refpos) { - - String alignedSeq = seq ; -// if ( isRC ) { -// alignedSeq = ReverseComplement(seq); -// } else alignedSeq = seq; - mSeqRC.add(isRC); - - // will hold actual position on the grid; reference can have insertions on the grid, - // so position on the grid where we should start placing the read is not refpos! - int pos = 0; - for ( int i = 0 ; i < refpos ; i++ ) { // i is the position on the original reference - // if we got some insertions on the reference prior to refpos, we need to count them in: - while( mRefGrid.charAt(pos) == '+' ) { - mSeqGrid.get(pos).add(' '); // add additional spaces in the line that will hold sequence seq - pos++; - } - mSeqGrid.get(pos).add(' '); // fill with ' ' to the left of the read - pos++; - } - - // we reached start position of the alignment on the reference grid - - int readpos = 0; // position on the read - - for ( int i = 0 ; i < cigar.numCigarElements() ; i++ ) { - - final CigarElement ce = cigar.getCigarElement(i); - - switch(ce.getOperator()) { - case I: // read has an insertion - for ( int j = 0 ; j < ce.getLength() ; j++ ) { - if ( pos >= mRefGrid.length() ) break; - if ( pos >= 0 ) { - if ( mRefGrid.charAt(pos) !='+' ) { // there was no insertion here yet: add it now! 
- mRefGrid.insert(pos, '+'); - headerGrid.insert(pos,'+'); - MSAColumn c = new MSAColumn(); - // reads up to the previous depth (prior to adding current read) did not - // have an insertion here, so we insert '*' into all of them: - for ( int k = 0 ; k < mDepth ; k++ ) { - if ( mSeqGrid.get(pos-1).charAt(k) == ' ') c.add(' '); - else c.add('*'); - } - mSeqGrid.add(pos, c); // finally, add the base from the current read - } - mSeqGrid.get(pos).add(alignedSeq.charAt(readpos)); - } - readpos++; - pos++; - } - break; - case D: // read has a deletion - for ( int j = 0 ; j < ce.getLength() ; j++ ) { - while( pos < mRefGrid.length() && mRefGrid.charAt(pos) == '+' ) { // skip insertions on the ref - mSeqGrid.get(pos).add('*'); - pos++; - } - if ( pos >= mRefGrid.length() ) break; - mSeqGrid.get(pos).add('-'); // mark deletion - headerGrid.setCharAt(pos,'-'); - pos++; - } - break; - case M: - for ( int j = 0 ; j < ce.getLength() ; j++ ) { - // if ref has an insertion, but the read does not: skip the insertion and continue with "gapless" alignment - while( pos < mRefGrid.length() && mRefGrid.charAt(pos) == '+' ) { - mSeqGrid.get(pos).add('*'); - pos++; - } - if ( pos >= mRefGrid.length() ) break; - mSeqGrid.get(pos).add(alignedSeq.charAt(readpos)); - if ( Character.toUpperCase(alignedSeq.charAt(readpos)) != - Character.toUpperCase(mRefGrid.charAt(pos)) - && headerGrid.charAt(pos)== ' ') headerGrid.setCharAt(pos,'*'); - pos++; - readpos++; - } - break; - default : throw new IllegalArgumentException("Unknown cigar element"); - } - } - for ( int i = pos ; i < mRefGrid.length() ; i++ ) { // i is the position on the modified reference - mSeqGrid.get(i).add(' '); // fill with ' ' to the left of the read - } - mDepth++; - } - - public String format() { - StringBuffer b = new StringBuffer(); - b.append(" "); - b.append(mRefGrid); - b.append('\n'); - - try { - for ( int i = 0 ; i < mDepth; i++ ) { - if ( mSeqRC.get(i).booleanValue() ) b.append("<-"); - else b.append("->"); - for ( 
int j = 0 ; j < mRefGrid.length() ; j++) { - b.append(mSeqGrid.get(j).charAt(i)); - } - b.append('\n'); - } - } catch (Exception e) {} - return b.toString(); - } - - private String ReverseComplement(String s) { - StringBuffer b = new StringBuffer(); - char [] data = s.toCharArray(); - for ( int i = data.length - 1 ; i >= 0 ; i-- ) b.append(BaseComplement(data[i])); - return b.toString(); - } - - private char BaseComplement(char b) { - switch ( b ) { - case 'A' : return 'T'; - case 'C': return 'G'; - case 'G': return 'C'; - case 'T': return 'A'; - default: throw new IllegalArgumentException(b + " is not a DNA base"); - } - } - - public void colorprint() { colorprint(false); } - - public void dotprint(boolean printId) { - - String skip = null; - if ( printId ) skip = new String(" "); - else skip = new String(" "); - - System.out.print(formatHeader(skip)); - System.out.print(skip); - System.out.println(mRefGrid); - - try { - for ( int i = 0 ; i < mDepth; i++ ) { - if ( printId ) System.out.printf("%3d",i); - if ( mSeqRC.get(i).booleanValue() ) System.out.print("<-"); - else System.out.print("->"); - for ( int j = 0 ; j < mRefGrid.length() ; j++) { - char seqbase = mSeqGrid.get(j).charAt(i); - char refbase = mRefGrid.charAt(j); - if ( isBase(refbase) && isBase(seqbase) && - Character.toUpperCase(refbase) == - Character.toUpperCase(seqbase) ) { - if ( mSeqRC.get(i) ) System.out.print(','); - else System.out.print('.'); - } - else System.out.print(seqbase); - } - System.out.print('\n'); - } - } catch (Exception e) {} - } - - - public void colorprint(boolean printId) { - - String skip = null; - if ( printId ) skip = new String(" "); - else skip = new String(" "); - - System.out.print(formatHeader(skip)); - System.out.print(skip); - System.out.println(mRefGrid); - - try { - for ( int i = 0 ; i < mDepth; i++ ) { - if ( printId ) System.out.printf("%3d",i); - if ( mSeqRC.get(i).booleanValue() ) System.out.print("<-"); - else System.out.print("->"); - for ( int j = 0 ; j < 
mRefGrid.length() ; j++) { - char seqbase = mSeqGrid.get(j).charAt(i); - char refbase = mRefGrid.charAt(j); - if ( isBase(refbase) && isBase(seqbase) && - Character.toUpperCase(refbase) != - Character.toUpperCase(seqbase) ) System.out.print("\033[31m"+seqbase+"\033[30m"); - else System.out.print(seqbase); - } - System.out.print('\n'); - } - } catch (Exception e) {} - } - - private String formatHeader(String leadString) { - char [][] mm_strings = new char[2][mRefGrid.length()]; - for ( int i = 0 ; i < mRefGrid.length() ; i++ ) { - int count = 0; - char refC = mRefGrid.charAt(i); - MSAColumn col = mSeqGrid.get(i); - if ( refC == '+' ) { - // count number of observations for insertion - for ( int j = 0 ; j < col.size() ; j++ ) { - if ( col.charAt(j) != '*' && col.charAt(j) != ' ') count++; - } - } else { - if ( headerGrid.charAt(i) == '-' ) { - // count number of observations for deletion - for ( int j = 0 ; j < col.size() ; j++ ) { - if ( col.charAt(j) == '-' ) count++; - } - } else { - if ( headerGrid.charAt(i) == '*') { - for ( int j = 0 ; j < col.size() ; j++ ) { - if ( col.charAt(j)!=' ' && - Character.toUpperCase(col.charAt(j)) != - Character.toUpperCase(refC) ) count++; - } - } - } - } - if ( count > 9 ) mm_strings[0][i] = Character.forDigit(count/10,10); - else mm_strings[0][i] = ' '; - if ( count > 0 ) mm_strings[1][i] = Character.forDigit(count%10,10); - else mm_strings[1][i] = ' '; - } - - StringBuilder b = new StringBuilder(); - b.append(leadString); - b.append(mm_strings[0]); - b.append('\n'); - b.append(leadString); - b.append(mm_strings[1]); - b.append('\n'); - b.append(leadString); - b.append(headerGrid); - b.append('\n'); - return b.toString(); - } - - private boolean isBase(char b) { - b = Character.toUpperCase(b); - return ( b=='A' ||b == 'C' || b=='G' || b=='T' || b=='N'); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/ShowMSA.java b/archive/java/src/org/broadinstitute/sting/oldindels/ShowMSA.java deleted file mode 100755 
index afef06875..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/ShowMSA.java +++ /dev/null @@ -1,230 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -import java.io.File; - -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import net.sf.picard.reference.ReferenceSequenceFileWalker; -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Option; -import net.sf.picard.cmdline.Usage; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; - -public class ShowMSA extends CommandLineProgram { - - // Usage and parameters - @Usage(programVersion="1.0") public String USAGE = "Prints MSA into stdout\n"; - @Option(shortName="I", doc="SAM or BAM file with alignment data") public File INPUT_FILE; - @Option(shortName="L", doc="Contig:Start-Stop or Contig:poslocation of the window to draw") public String LOCATION; - @Option(shortName="W", doc="Number of bases on each side of specified position if LOCATION is in Contig:pos format; ignored otherwise", optional=true) public Integer WINDOW; - @Option(shortName="R", doc="Reference fastb file") public File REF_FILE; - @Option(shortName="P", doc="If true, then any read (partially) overlapping with the specified region will be shown. "+ - "Otherwise (default), only reads fully contained in the specified interval are shown", optional=true) public Boolean PARTIAL; - @Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE; - @Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS; - @Option(shortName="F",doc="Format: PILE - show alignment, FASTA - print sequences in fasta",optional=true) public String OUT_FORMAT; - - /** Required main method implementation. 
*/ - public static void main(final String[] argv) { - System.exit(new ShowMSA().instanceMain(argv)); - } - - protected int doWork() { - - if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") ) { - System.out.println("Unknown value specified for ERR_MODE"); - return 1; - } - - if ( PARTIAL == null ) PARTIAL = new Boolean(false); - if ( OUT_FORMAT == null ) OUT_FORMAT=new String("PILE"); - - if ( ! OUT_FORMAT.equals("PILE") && ! OUT_FORMAT.equals("FASTA")) { - System.out.println("OUT_FORMAT can only have values PILE or FASTA"); - return 1; - } - - if ( ! INPUT_FILE.exists() ) { - System.out.println("Specified INPUT_FILE does not exist"); - return 1; - } - - if ( ! REF_FILE.exists() ) { - System.out.println("Specified REF_FILE does not exist"); - return 1; - } - - if ( LOCATION.indexOf(':') == -1 ) { - System.out.println("LOCATION should follow Contig:Start-Stop or Contig:Pos format"); - return 1; - } - String[] s1 = LOCATION.split(":"); - int contig; - try { - contig = Integer.valueOf(s1[0]); - } catch (NumberFormatException e) { - System.out.println("LOCATION: contig must be specified as an integer"); - return 1; - } - - if ( s1.length != 2 ) { - System.out.println("LOCATION should follow Contig:Start-Stop or Contig:Pos format"); - return 1; - } - - String s2[] = s1[1].split("-"); - if ( s2.length > 2 ) { - System.out.println("LOCATION should follow Contig:Start-Stop or Contig:Pos format"); - return 1; - } - int left, right; - if ( s2.length == 2 ) { - try { - left = Integer.valueOf(s2[0]); - right = Integer.valueOf(s2[1]); - } catch (NumberFormatException e) { - System.out.println("LOCATION: window boundaries should be specified as integers"); - return 1; - } - } else { - int pos = 0; - try { - pos = Integer.valueOf(s2[0]); - } catch (NumberFormatException e) { - System.out.println("LOCATION: position on the contig should be specified as an integer"); - return 1; - } - if (WINDOW == null ) { - System.out.println("WINDOW must be 
specified when LOCATION specifies a single poisiton (Contig:Pos)"); - return 1; - } - left = pos - WINDOW.intValue(); - right = pos+WINDOW.intValue(); - } - - - String ref_contig ; - - try { - ReferenceSequenceFileWalker mRefReader = - new ReferenceSequenceFileWalker(ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE)); - ref_contig = mRefReader.get(contig).toString(); // reload ref - } catch (Exception e) { - System.out.println("Failed to read reference sequence from " + REF_FILE); - return 1; - } - - SAMFileReader reader ; - try { - reader = new SAMFileReader(INPUT_FILE); - } catch ( Exception e) { - System.out.println(e.getMessage()); - return 1; - } - - SequencePile msa=null; - - if ( OUT_FORMAT.equals("PILE")) { - msa = new SequencePile(ref_contig.substring(left-1, right)); - } else { - System.out.println(">reference "+contig+":"+left+"-"+right); - System.out.println(ref_contig.substring(left-1, right)); - } - - for( SAMRecord r : reader ) { - if ( r.getReadUnmappedFlag() ) continue; - if ( r.getReferenceIndex() < contig ) continue; - if ( r.getReferenceIndex() > contig ) break; - if ( r.getAlignmentEnd() < left ) continue; - if ( r.getAlignmentStart() >= right ) break; - if ( ! 
PARTIAL && ( r.getAlignmentStart() < left || r.getAlignmentEnd() >= right ) ) continue; - - int err = -1; - if ( ERR_MODE.equals("MM")) err = numMismatches(r); - else if ( ERR_MODE.equals("ERR")) err = numErrors(r); - else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r); - if ( err > MAX_ERRS ) continue; - - if ( OUT_FORMAT.equals("PILE") ) { - msa.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), r.getCigar(), r.getAlignmentStart() - left); - } else { - System.out.print(">read "+r.getReadName()); - if ( r.getReadNegativeStrandFlag() ) System.out.println("(rc)"); - else System.out.println("(fw)"); - System.out.println(r.getReadString()); - } - } - - if ( OUT_FORMAT.equals("PILE") ) msa.colorprint(); -//// System.out.println(msa.format()); - - return 0; - } - - /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD - * - * @param r SAM record that must specify an alignment - * @return number of errors (number of mismatches plus total length of all insertions/deletions - * @throws RuntimeException if cigar contains any elements other than M,I,D - */ - private static int numErrors(SAMRecord r) throws RuntimeException { - - // NM currently stores the total number of mismatches in all blocks + 1 - int errs = numMismatches(r); - - // now we have to add the total length of all indels: - Cigar c = r.getCigar(); - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - switch( ce.getOperator()) { - case M : break; // we already have correct number of mismatches - case I : - case D : - errs += ce.getLength(); - break; - default: throw new RuntimeException("Unrecognized cigar element"); - } - } - return errs; - } - - /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD - * - * @param r SAM record that must specify an alignment - * @return number of errors (number of mismatches plus total number of all insertions/deletions 
(each insertion or - * deletion will be counted as a single error regardless of the length) - * @throws RuntimeException if cigar contains any elements other than M,I,D - */ - private static int numMismatchesGaps(SAMRecord r) throws RuntimeException { - - // NM currently stores the total number of mismatches in all blocks + 1 - int errs = numMismatches(r); - - // now we have to add the total length of all indels: - Cigar c = r.getCigar(); - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - switch( ce.getOperator()) { - case M : break; // we already have correct number of mismatches - case I : - case D : - errs++; - break; - default: throw new RuntimeException("Unrecognized cigar element"); - } - } - return errs; - } - - - /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */ - private static int numMismatches(SAMRecord r) throws RuntimeException { - - // NM currently stores the total number of mismatches in all blocks + 1 - return ((Integer)r.getAttribute("NM")).intValue() - 1; - - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/SimpleInterval.java b/archive/java/src/org/broadinstitute/sting/oldindels/SimpleInterval.java deleted file mode 100644 index 051672950..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/SimpleInterval.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.broadinstitute.sting.playground.utils; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Mar 19, 2009 - * Time: 12:44:37 PM - * To change this template use File | Settings | File Templates. - */ - -/** Provides minimum complete implementation of Interval interface. 
- */ -public class SimpleInterval implements Interval { - - private long m_start; - private long m_stop; - - public SimpleInterval(long start, long stop) { m_start = start; m_stop = stop; } - - public SimpleInterval(Interval i) { m_start = i.getStart(); m_stop = i.getStop(); } -// public SimpleInterval() { m_start = -1; m_stop = -2; } - - /** Start position of the interval. - * - * @return for the interval [start,stop] - */ - @Override - public long getStart() { return m_start; } - - /** Sets start position of the interval. - * - * @param s start coordinate - */ - public void setStart(long s) { m_start = s; } - - /** End position of the interval. - * - * @return for the interval [start,stop] - */ - @Override - public long getStop() { return m_stop; } - - /** Sets stop position of the interval. - * - * @param s stop coordinate - */ - public void setStop(long s) { m_stop = s; } - - /** Length of the interval. This default implementation returns getStop() - getStart() + 1. - * - * @return - */ - @Override - public long getLength() { return (m_stop - m_start + 1); }; - - /** Returns true if this interval overlaps with i as judjed by getStart() and getStop() positions of the - * two interval objects. - * @param i Another interval - * @return true iff intervals overlap - */ - @Override - public boolean overlapsP(org.broadinstitute.sting.playground.utils.Interval i) { - return ! disjointP(i); - } - - /** Returns true if this interval does not overlap with i as judjed by getStart() and getStop() positions of the - * two interval objects. 
- * @param i Another interval - * @return true iff intervals do not overlap - */ - @Override - public boolean disjointP(org.broadinstitute.sting.playground.utils.Interval i) { - return ( i.getStop() < this.m_start || i.getStart() > this.m_stop ); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/StrictlyUpperTriangularMatrix.java b/archive/java/src/org/broadinstitute/sting/oldindels/StrictlyUpperTriangularMatrix.java deleted file mode 100755 index 3c7f0c593..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/StrictlyUpperTriangularMatrix.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -public class StrictlyUpperTriangularMatrix extends SymmetricMatrix { - - public StrictlyUpperTriangularMatrix(int dimension) { - super(dimension); - assert dimension >=2 : "Distance matrix can not be smaller than 2x2"; - } - - public double get(int i, int j) { - if ( i >= j ) return 0.0; - return super.get(i,j); - } - - public void set(int i, int j, double value) { - assert i < j : "Only i < j elements can be set in strictly upper diagonal matrix" ; - super.set(i,j,value); - } - - - - private static void testMe() { - StrictlyUpperTriangularMatrix m = new StrictlyUpperTriangularMatrix(3); - - m.set(0,1,0.54321); - m.set(0,2,0.43215); - m.set(1,2,0.321); - - System.out.println( m.format()); - } - - public static void main(String[] argv) { - testMe(); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/SymmetricMatrix.java b/archive/java/src/org/broadinstitute/sting/oldindels/SymmetricMatrix.java deleted file mode 100644 index b7c2509da..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldindels/SymmetricMatrix.java +++ /dev/null @@ -1,87 +0,0 @@ -package org.broadinstitute.sting.playground.indels; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Mar 22, 2009 - * Time: 3:43:05 PM - * To change this template use File | Settings | File Templates. 
- */ -public class SymmetricMatrix { - - protected double [] data; - protected int size; - - public SymmetricMatrix(int dimension) { - assert dimension >= 0 : "Matrix size can not be negative"; - if ( dimension % 2 == 0 ) { - int k = dimension >> 1; // dimension/2 - data = new double[k*(dimension+1)]; - } else { - int k = ( dimension + 1 ) >> 1; // (dimension + 1)/2 - data = new double[k*dimension]; - } - size = dimension; - } - - public double get(int i, int j) { - assert (i < size) && ( j < size) : "Out of bound index into matrix detected"; - if ( i >= j ) return data[linearOffset(j,i)]; // we store only the upper triangle in memory - return data[ linearOffset(i,j) ]; - } - - public void set(int i, int j, double value) { - assert (i < size) && (j < size) : "Out of bound index into matrix detected"; - - if ( i >= j ) data[ linearOffset(j,i) ] = value; - else data[ linearOffset(i,j) ] = value; - } - - public int size() { return size; } - - public int nRows() { return size; } - public int nCols() { return size; } - - /** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!); - * each element is formatted according to a default format. - * @return - * @see #format(String f) - */ - public String format() { - return format("%6.3f "); - } - - - /** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!); - * each element is formatted according to a specified format string (note: format string must include all desired - * whitespaces before and/or after an element, as this method itself does not add any spaces between the formatted elements). 
- * @return - * @see #format() - */ - public String format(String f) { - StringBuilder b = new StringBuilder(); - java.util.Formatter frm = new java.util.Formatter(b); - for ( int i = 0 ; i < size ; i++ ) { - for ( int j = 0 ; j < size ; j++ ) { - frm.format(f, get(i,j)); - } - b.append('\n'); - } - return b.toString(); - } - - - /** Computes linear offset into the internal array that keeps actual data, given "row" and "column" indices - * into the matrix; this method is unchecked, but it expects i <= j otherwise the result is unspecified. - * @param i row index - * @param j column index - * @return linear offset into the data[] member of this class - */ - private int linearOffset(int i, int j) { - int k = (( size << 1 ) - i + 1)*i; // [ 2*d - (i+1) ] * i - k >>= 1; // k/=2 - // now k is the offset of the first stored element in row i - return ( k + j - i ); - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/AbstractFirecrestFileParser.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/AbstractFirecrestFileParser.java deleted file mode 100644 index 80d42682a..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/AbstractFirecrestFileParser.java +++ /dev/null @@ -1,133 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
-*/ -//package edu.mit.broad.picard.illumina; -package org.broadinstitute.sting.secondarybase; - -import edu.mit.broad.picard.util.BasicTextFileParser; - -import java.io.Closeable; -import java.io.File; -import java.io.FilenameFilter; -import java.util.*; - -/** - * Abstract base class for implementing parsers for various versions of Firecrest output - */ -public abstract class AbstractFirecrestFileParser implements Iterator, Iterable, Closeable { - protected final int lane; - protected final File firecrestDirectory; - private FirecrestReadData next = null; - private boolean iterating = false; - - /** - * Examine the bustard directory to see if it is valid, and prepare for parsing - */ - public AbstractFirecrestFileParser(final File firecrestDirectory, final int lane) { - this.lane = lane; - this.firecrestDirectory = firecrestDirectory; - } - - /** - * @return true if the given bustard directory contains the appropriate files, or at least enough - * of them so that it appears to be a Firecrest directory corresponding to the version of the concrete - * FirecrestFileParser implementation. - */ - public abstract boolean isValidFirecrestDirectory(); - - /** - * Called before iteration begins. If this method is called when isValidFirecrestDirectory() had - * return false, it will generate exceptions that may help the user diagnose the problem. - */ - protected abstract void prepareToIterate(); - - /** - * @return the next read - */ - protected abstract FirecrestReadData readNext(); - - - /** - * @return an iterator over a set of elements of type FirecrestReadData - */ - public Iterator iterator() { - if (iterating) { - throw new IllegalStateException("iterator() method can only be called once, before the first call to hasNext()"); - } - prepareToIterate(); - next = readNext(); - iterating = true; - return this; - } - - /** - * @return true if the iteration has more elements. Otherwise returns false. 
- */ - public boolean hasNext() { - if (!iterating) { - iterator(); - } - return next != null; - } - - /** - * Returns the next element in the iteration. - * - * @return the next element in the iteration - * @throws java.util.NoSuchElementException - */ - public FirecrestReadData next() { - - if (!hasNext()) { - throw new NoSuchElementException("Iteration has no more elements."); - } - - final FirecrestReadData result = next; - next = readNext(); - return result; - } - - /** - * Required method for Iterator API. - * - * @throws UnsupportedOperationException - */ - public void remove() { - throw new UnsupportedOperationException("Remove() not supported."); - } - - /** - * Override, e.g. to close parser - */ - public void close() { - } - - public int getLane() { return this.lane; } - - /** - * Convenience method to create a parser for a list of files of the same format that should - * be parsed in order defined by FirecrestFilenameComparator - * @param files to be iterated, in arbitrary order - * @return parser that iterates through the files in the appropriate order - */ - protected BasicTextFileParser makeParserForTextFiles(final boolean treatGroupedDelimitersAsOne, File[] files) { - final SortedSet sortedRead1 = new TreeSet(new FirecrestFilenameComparator()); - sortedRead1.addAll(Arrays.asList(files)); - files = sortedRead1.toArray(files); - return new BasicTextFileParser(treatGroupedDelimitersAsOne, files); - } - - protected File[] getFilesMatchingRegexp(final String regexp) { - return firecrestDirectory.listFiles( new FilenameFilter() { - public boolean accept(final File dir, final String name) { - return name.matches(regexp); - } - }); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/AddFourProbsToSAM.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/AddFourProbsToSAM.java deleted file mode 100755 index 7d918801d..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/AddFourProbsToSAM.java 
+++ /dev/null @@ -1,94 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import net.sf.samtools.*; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.File; -import java.util.HashMap; - -public class AddFourProbsToSAM extends CommandLineProgram { - public static AddFourProbsToSAM Instance = null; - - public File UNALIGNED_SAM; - public File ALIGNED_SAM; - public File FINAL_SAM; - public int END; - public Boolean DEBUG = false; - - public static void main(String[] argv) { - Instance = new AddFourProbsToSAM(); - start(Instance, argv); - } - - protected void setupArgs() { - //m_parser.addRequiredArg("unaligned_sam", "U", "Unaligned SAM file", "UNALIGNED_SAM"); - //m_parser.addRequiredArg("aligned_sam", "A", "Aligned SAM file", "ALIGNED_SAM"); - //m_parser.addRequiredArg("final_sam", "F", "Final SAM file", "FINAL_SAM"); - //m_parser.addRequiredArg("end", "E", "Pair end (0 - all, 1 - first, 2 - second)", "END"); - //m_parser.addOptionalFlag("debug", "D", "Turn on debugging output", "DEBUG"); - } - - protected int execute() { - int processed; - - SAMFileReader alignedSf = new SAMFileReader(ALIGNED_SAM); - alignedSf.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); - - // First, hash the aligned records (because there are less of them than unaligned reads) - System.err.println("Hashing aligned records..."); - - HashMap records = new HashMap(10000000); - processed = 0; - for (SAMRecord alignedSr : alignedSf) { - if (END == 0 || (END == 1 && alignedSr.getSecondOfPairFlag() == false) || (END == 2 && alignedSr.getSecondOfPairFlag() == true)) { - if (!alignedSr.getReadUnmappedFlag()) { - records.put(alignedSr.getReadName(), alignedSr); - - if (processed % 100000 == 0) { System.err.print("\tProcessed " + processed + " records.\r"); } - processed++; - } - } - } - - // Now, iterate over the unaligned SAM file and stick the four-base probs in. 
- System.err.println("\nInterating over unaligned records..."); - - SAMFileReader unalignedSf = new SAMFileReader(UNALIGNED_SAM); - unalignedSf.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); - - SAMFileHeader swhead = alignedSf.getFileHeader(); - swhead.setSortOrder(SAMFileHeader.SortOrder.unsorted); - SAMFileWriter sw = new SAMFileWriterFactory().makeSAMOrBAMWriter(swhead, true, FINAL_SAM); - - processed = 0; - for (SAMRecord unalignedSr : unalignedSf) { - if (records.containsKey(unalignedSr.getReadName())) { - SAMRecord alignedSr = records.get(unalignedSr.getReadName()); - - byte[] sq = (byte[]) unalignedSr.getAttribute("SQ"); - if (alignedSr.getReadNegativeStrandFlag()) { - sq = QualityUtils.reverseComplementCompressedQualityArray(sq); - } - - alignedSr.setAttribute("SQ", sq); - alignedSr.setAttribute("KB", unalignedSr.getReadBases()); - alignedSr.setAttribute("KQ", unalignedSr.getBaseQualities()); - sw.addAlignment(alignedSr); - - if (DEBUG) { - System.out.println(alignedSr.format()); - } - - if (processed % 100000 == 0) { System.err.print("\tProcessed " + processed + " records.\r"); } - processed++; - } - } - - sw.close(); - alignedSf.close(); - unalignedSf.close(); - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/AnnotateSecondaryBase.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/AnnotateSecondaryBase.java deleted file mode 100755 index 4db9fdeea..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/AnnotateSecondaryBase.java +++ /dev/null @@ -1,344 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import net.sf.samtools.*; -import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import 
java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane. - * First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those - * generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities - * is evaluated against this model to determine the base probability distribution of a given - * base observation. - * - * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard", - * agree on the identity of the best base. In these cases, we simply annotate our estimate - * of the second-best base. In cases where this method and Bustard disagree, we annotate - * the secondary base as this method's primary base. - * - * @author Kiran Garimella - */ - -/* - An example invocation: - java -Xmx2048m -Djava.io.tmpdir=/broad/hptmp/ -jar /path/to/AnnotateSecondaryBase.jar \ - -D /seq/solexaproc2/SL-XAX/analyzed/090217_SL-XAX_0003_FC30R47AAXX/Data/C1-152_Firecrest1.3.2_25-02-2009_prodinfo/Bustard1.3.2_25-02-2009_prodinfo/ \ - -L 5 \ - -B 30R47AAXX090217 - -CR '0-75,76-151' \ - -SO ~/test.sam \ - -SI /seq/picard/30R47AAXX/C1-152_2009-02-17_2009-04-02/5/Solexa-10265/30R47AAXX.5.aligned.bam \ - */ - -public class AnnotateSecondaryBase extends CommandLineProgram { - public static AnnotateSecondaryBase Instance = null; - - @Argument(fullName="bustard_dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR; - @Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE; - @Argument(fullName="run_barcode", shortName="B", doc="Run barcode (embedded as part of the read name; i.e. 30R47AAXX090217)") public String RUN_BARCODE; - @Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. 
'0-50,56-106') (0-based, inclusive)") public String CYCLE_RANGES; - @Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT; - @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN; - @Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000; - @Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE; - @Argument(fullName="unaligned_sam", shortName="US", doc="Unaligned sam file, so we can skip making it", required=false) public File USAM; - @Argument(fullName="aligned_sam", shortName="AS", doc="Aligned, queryname-sorted sam file, so we can skip resorting it", required=false) public File ASAM; - - public static void main(String[] argv) { - Instance = new AnnotateSecondaryBase(); - start(Instance, argv); - } - - protected int execute() { - ArrayList> cycleRanges = getCycleRanges(CYCLE_RANGES); - File unalignedSam; - - if (USAM == null || !USAM.exists()) { - BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT); - - // Iterate through raw Firecrest data and store the first N reasonable reads up to TRAINING_LIMIT - System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reasonable reads in the raw data..."); - trainer.loadFirstNReasonableReadsTrainingSet(); - - // Iterate through the stored training data and add the info to the BasecallingReadModel - System.out.println("Applying training set..."); - BasecallingReadModel model = new BasecallingReadModel(trainer.getTrainingData()); - - // Call bases and write results - System.out.println("Calling bases..."); - - SAMFileHeader sfh = new SAMFileHeader(); - sfh.setSortOrder(SAMFileHeader.SortOrder.queryname); - - unalignedSam = (canAnnotate(SAM_IN)) ? 
getTempSAMFile("unaligned") : SAM_OUT; - SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam); - - IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE); - - BasecallingStats bstats = new BasecallingStats(); - - while (bstats.getReadsTotal() < CALLING_LIMIT && iparser.next()) { - RawRead rr = iparser.getRawRead(); - FourProbRead fpr = model.call(rr); - - for (int cycleRangeIndex = 0; cycleRangeIndex < cycleRanges.size(); cycleRangeIndex++) { - Pair cycleRange = cycleRanges.get(cycleRangeIndex); - - RawRead rrEnd = iparser.getSubset(cycleRange.getFirst(), cycleRange.getSecond()); - FourProbRead fprEnd = fpr.getSubset(cycleRange.getFirst(), cycleRange.getSecond()); - - sfw.addAlignment(constructSAMRecord(rrEnd, fprEnd, sfh, RUN_BARCODE, cycleRanges.size() == 2, cycleRangeIndex == 1)); - - if (cycleRangeIndex == 0) { - bstats.update(rrEnd, fprEnd); - bstats.notifyOnInterval(5000); - } - } - } - - bstats.notifyNow(); - - iparser.close(); - sfw.close(); - } else { - unalignedSam = USAM; - } - - if (canAnnotate(SAM_IN)) { - // If we're in annotation mode, annotate the aligned BAM file with the SQ tag - System.out.println("Annotating aligned SAM file..."); - - File alignedSam; - if (ASAM == null || !ASAM.exists()) { - System.out.println(" sorting aligned SAM file by read name..."); - alignedSam = getTempSAMFile("aligned"); - sortBAMByReadName(SAM_IN, alignedSam); - } else { - alignedSam = ASAM; - } - - System.out.println(" merging unaligned and aligned SAM files..."); - File mergedSam = SAM_OUT; - mergeUnalignedAndAlignedBams(unalignedSam, alignedSam, mergedSam); - } - - System.out.println("Done."); - - return 0; - } - - /** - * Return a tempfile. This is a laziness method so that I don't have to litter my code with try/catch blocks for IOExceptions. 
- * - * @param prefix the prefix for the temp file - * @return the temp file - */ - private File getTempSAMFile(String prefix) { - try { - File tempFile = File.createTempFile(prefix, ".sam", SAM_OUT.getParentFile()); - //tempFile.deleteOnExit(); - - // Ensure that the volumes we're about to use are ready. - PathUtils.refreshVolume(tempFile); - PathUtils.refreshVolume(new File(System.getProperty("java.io.tmpdir"))); - - return tempFile; - } catch (IOException e) { - throw new StingException("Unable to create tempfile in directory " + SAM_OUT.getParent()); - } - } - - /** - * Parse the cycle_ranges string that defines the cycle where a read starts and stops. - * Comma-separated ranges are interpreted to be the first and second end of a pair. - * - * @param cycleRangesString the 0-based, inclusive, comma-separated ranges (i.e. '0-50,51-100') - * @return an ArrayList of cycle ranges - */ - private ArrayList< Pair > getCycleRanges(String cycleRangesString) { - ArrayList< Pair > cycleRanges = new ArrayList< Pair >(); - - String[] pieces = cycleRangesString.split(","); - - Pattern p = Pattern.compile("(\\d+)-(\\d+)"); - - for (String piece : pieces) { - Matcher m = p.matcher(piece); - - if (m.find()) { - Integer cycleStart = new Integer(m.group(1)); - Integer cycleStop = new Integer(m.group(2)); - - cycleRanges.add(new Pair(cycleStart, cycleStop)); - } - } - - if (cycleRanges.size() == 0) { - throw new StingException("At least one cycle range must be specified."); - } - - if (cycleRanges.size() > 2) { - throw new StingException(cycleRanges.size() + " specified, but we're unable to handle more than 2."); - } - - return cycleRanges; - } - - /** - * Simple test to determine whether we're in aligned bam annotation mode or not. 
- * - * @param samfile the aligned sam file - * @return true if the file exists, false otherwise - */ - private boolean canAnnotate(File samfile) { - return (samfile != null && samfile.exists()); - } - - /** - * Construct a SAMRecord object with the specified information. The secondary bases - * will be annotated suchthat they will not conflict with the primary base. - * - * @param rr the raw Illumina read - * @param fpr the four-base distributions for every base in the read - * @param sfh the SAM header - * @param runBarcode the run barcode of the lane (used to prefix the reads) - * @param isPaired is this a paired-end lane? - * @param isSecondEndOfPair is this the second end of the pair? - * - * @return a fully-constructed SAM record - */ - private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean isSecondEndOfPair) { - SAMRecord sr = new SAMRecord(sfh); - - sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0"); - sr.setReadUmappedFlag(true); - sr.setReadString(rr.getSequenceAsString()); - sr.setBaseQualities(rr.getQuals()); - - sr.setReadPairedFlag(isPaired); - if (isPaired) { - sr.setMateUnmappedFlag(true); - sr.setFirstOfPairFlag(!isSecondEndOfPair); - } - - sr.setAttribute("SQ", fpr.getSQTag(rr)); - - return sr; - } - - /** - * Resorts a SAM file to queryname order. 
- * - * @param samFile the input SAM file - * @param sortedSamFile the sorted SAM output file - */ - private void sortBAMByReadName(File samFile, File sortedSamFile) { - SAMFileReader samIn = new SAMFileReader(samFile); - - SAMFileHeader sfh = samIn.getFileHeader(); - sfh.setSortOrder(SAMFileHeader.SortOrder.queryname); - - SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, sortedSamFile); - - for (SAMRecord sr : samIn) { - samOut.addAlignment(sr); - } - - samIn.close(); - samOut.close(); - } - - /** - * Merges two SAM files that have been sorted in queryname order - * - * @param queryNameSortedUnalignedSam the sorted unaligned SAM file - * @param queryNameSortedAlignedSam the sorted aligned SAM file - * @param mergedSam the output file where the merged results should be stored - */ - private void mergeUnalignedAndAlignedBams(File queryNameSortedUnalignedSam, File queryNameSortedAlignedSam, File mergedSam) { - SAMFileReader usam = new SAMFileReader(queryNameSortedUnalignedSam); - SAMFileReader asam = new SAMFileReader(queryNameSortedAlignedSam); - - SAMFileHeader sfh = asam.getFileHeader(); - sfh.setSortOrder(SAMFileHeader.SortOrder.coordinate); - - SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, mergedSam); - - CloseableIterator usamIt = usam.iterator(); - CloseableIterator asamIt = asam.iterator(); - - SAMRecord usr = usamIt.next(); - SAMRecord asr = asamIt.next(); - - int annotatedRecords = 0; - - do { - // Pull a record from the unaligned file and advance the aligned file until we find the matching record. We - // don't have to advance the unaligned file until we find our record because we assume every record we generate - // will be in the aligned file (which also contains unaligned reads). - // - // If Picard ever stops storing the unaligned reads, this logic will need to be rewritten. 
- System.out.println(asr.getReadString()); - System.out.println(BaseUtils.simpleReverseComplement(asr.getReadString())); - System.out.println(); - - if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) { - byte[] sqtag = (byte[]) usr.getAttribute("SQ"); - String usrread = usr.getReadString(); - String asrread = asr.getReadString(); - - System.out.println(asrread); - - if (asr.getReadNegativeStrandFlag()) { - sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag); - asrread = BaseUtils.simpleReverseComplement(asrread); - - System.out.println(asrread); - } - - if (usrread != null && asrread != null && !usrread.equals(asrread)) { - throw new StingException( - String.format("Purportedly identical unaligned and aligned reads have different read sequences. Perhaps this lane was reanalyzed by the Illumina software but not the production pipeline?\n '%s:%b:%s'\n '%s:%b:%s'", - usr.getReadName(), usr.getFirstOfPairFlag(), usrread, - asr.getReadName(), asr.getFirstOfPairFlag(), asrread)); - } - - asr.setAttribute("SQ", sqtag); - annotatedRecords++; - - System.out.println("Annotated " + annotatedRecords + " records."); - - usr = usamIt.next(); - } else { - asr = asamIt.next(); - } - - samOut.addAlignment(asr); - } while (usamIt.hasNext() && asamIt.hasNext()); - - usam.close(); - asam.close(); - samOut.close(); - } - - /** - * For debugging purposes. Spits out relevant information for two SAMRecords. 
- * - * @param sra first SAMRecord - * @param srb second SAMRecord - */ - private void printRecords(SAMRecord sra, SAMRecord srb) { - System.out.println("a: " + sra.getReadName() + " " + sra.getFirstOfPairFlag()); - System.out.println("b: " + srb.getReadName() + " " + srb.getFirstOfPairFlag()); - System.out.println(); - - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingBaseModel.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingBaseModel.java deleted file mode 100755 index bca41b638..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingBaseModel.java +++ /dev/null @@ -1,229 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import cern.colt.matrix.DoubleFactory1D; -import cern.colt.matrix.DoubleFactory2D; -import cern.colt.matrix.DoubleMatrix1D; -import cern.colt.matrix.DoubleMatrix2D; -import cern.colt.matrix.linalg.Algebra; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; - -/** - * BasecallingBaseModel is a class that represents the statistical - * model for all bases at a given cycle. It allows for easy, one - * pass training via the addTrainingPoint() method. Once the model - * is trained, computeLikelihoods will return the probability matrix - * over previous cycle's base hypotheses and current cycle base - * hypotheses (contextual prior is included in these likelihoods). 
- * - * @author Kiran Garimella - */ -public class BasecallingBaseModel { - private double[][] counts; - private DoubleMatrix1D[][] sums; - private DoubleMatrix2D[][] unscaledCovarianceSums; - - private DoubleMatrix1D[][] means; - private DoubleMatrix2D[][] inverseCovariances; - private double[][] norms; - - private cern.jet.math.Functions F = cern.jet.math.Functions.functions; - private Algebra alg; - - private boolean correctForContext = false; - private int numTheories = 1; - - private boolean readyToCall = false; - private boolean bustedCycle = false; - - /** - * Constructor for BasecallingBaseModel. - * - * @param correctForContext should we attempt to correct for contextual sequence effects? - */ - public BasecallingBaseModel(boolean correctForContext) { - this.correctForContext = correctForContext; - this.numTheories = (correctForContext) ? 4 : 1; - - counts = new double[this.numTheories][4]; - - sums = new DoubleMatrix1D[this.numTheories][4]; - unscaledCovarianceSums = new DoubleMatrix2D[this.numTheories][4]; - - means = new DoubleMatrix1D[this.numTheories][4]; - inverseCovariances = new DoubleMatrix2D[this.numTheories][4]; - norms = new double[this.numTheories][4]; - - for (int basePrevIndex = 0; basePrevIndex < this.numTheories; basePrevIndex++) { - for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { - sums[basePrevIndex][baseCurIndex] = (DoubleFactory1D.dense).make(4); - unscaledCovarianceSums[basePrevIndex][baseCurIndex] = (DoubleFactory2D.dense).make(4, 4); - - means[basePrevIndex][baseCurIndex] = (DoubleFactory1D.dense).make(4); - inverseCovariances[basePrevIndex][baseCurIndex] = (DoubleFactory2D.dense).make(4, 4); - } - } - - alg = new Algebra(); - } - - - /** - * Add a single training point to the model to estimate the means. 
- * - * @param probMatrix the matrix of probabilities for the base - * @param fourIntensity the four raw intensities for the base - */ - public void addMeanPoint(double[][] probMatrix, double[] fourIntensity) { - for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) { - for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { - double weight = probMatrix[basePrevIndex][baseCurIndex]; - - DoubleMatrix1D weightedChannelIntensities = (DoubleFactory1D.dense).make(fourIntensity); - weightedChannelIntensities.assign(F.mult(weight)); - - sums[basePrevIndex][baseCurIndex].assign(weightedChannelIntensities, F.plus); - counts[basePrevIndex][baseCurIndex] += weight; - } - } - - readyToCall = false; - } - - /** - * Add a single training point to the model to estimate the covariances. - * - * @param probMatrix the matrix of probabilities for the base - * @param fourIntensity the four raw intensities for the base - */ - public void addCovariancePoint(double[][] probMatrix, double[] fourIntensity) { - for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) { - for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { - double weight = probMatrix[basePrevIndex][baseCurIndex]; - - DoubleMatrix1D mean = sums[basePrevIndex][baseCurIndex].copy(); - mean.assign(F.div(counts[basePrevIndex][baseCurIndex])); - - DoubleMatrix1D sub = (DoubleFactory1D.dense).make(fourIntensity); - sub.assign(mean, F.minus); - - DoubleMatrix2D cov = (DoubleFactory2D.dense).make(4, 4); - alg.multOuter(sub, sub, cov); - - cov.assign(F.mult(weight)); - unscaledCovarianceSums[basePrevIndex][baseCurIndex].assign(cov, F.plus); - } - } - - readyToCall = false; - } - - /** - * Precompute all the matrix inversions and determinants we'll need for computing the likelihood distributions. 
- */ - public void prepareToCallBases() { - for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) { - for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { - means[basePrevIndex][baseCurIndex] = sums[basePrevIndex][baseCurIndex].copy(); - means[basePrevIndex][baseCurIndex].assign(F.div(counts[basePrevIndex][baseCurIndex])); - - inverseCovariances[basePrevIndex][baseCurIndex] = unscaledCovarianceSums[basePrevIndex][baseCurIndex].copy(); - inverseCovariances[basePrevIndex][baseCurIndex].assign(F.div(counts[basePrevIndex][baseCurIndex])); - - if (MathUtils.compareDoubles(alg.det(inverseCovariances[basePrevIndex][baseCurIndex]), 0.0) == 0) { - bustedCycle = true; - readyToCall = true; - - return; - } - - DoubleMatrix2D invcov = alg.inverse(inverseCovariances[basePrevIndex][baseCurIndex]); - - inverseCovariances[basePrevIndex][baseCurIndex] = invcov; - - norms[basePrevIndex][baseCurIndex] = Math.pow(alg.det(invcov), 0.5)/Math.pow(2.0*Math.PI, 2.0); - } - } - - readyToCall = true; - } - - /** - * Compute the likelihood matrix for a base. 
- * - * @param cycle the cycle we're calling right now - * @param fourintensity the four intensities of the current cycle's base - * @return a 4x4 matrix of likelihoods, where the row is the previous cycle base hypothesis and - * the column is the current cycle base hypothesis - */ - public double[][] computeLikelihoods(int cycle, double[] fourintensity) { - if (!readyToCall) { - prepareToCallBases(); - } - - double[][] likedist = new double[numTheories][4]; - - if (bustedCycle) { - likedist[0][0] = 1.0; - } else { - for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) { - for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { - double norm = norms[basePrevIndex][baseCurIndex]; - - DoubleMatrix1D sub = (DoubleFactory1D.dense).make(fourintensity); - sub.assign(means[basePrevIndex][baseCurIndex], F.minus); - - DoubleMatrix1D Ax = alg.mult(inverseCovariances[basePrevIndex][baseCurIndex], sub); - double exparg = -0.5*alg.mult(sub, Ax); - - likedist[basePrevIndex][baseCurIndex] = norm*Math.exp(exparg); - } - } - } - - return likedist; - } - - /** - * Write the model parameters to disk. 
- * - * @param outparam the file in which the output parameters should be stored - */ - public void write(File outparam) { - try { - PrintWriter writer = new PrintWriter(outparam); - - for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) { - for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { - writer.print("mean_" + BaseUtils.baseIndexToSimpleBase(baseCurIndex) + " = c("); - for (int channel = 0; channel < 4; channel++) { - writer.print(sums[basePrevIndex][baseCurIndex].getQuick(channel)/counts[basePrevIndex][baseCurIndex]); - - if (channel < 3) { - writer.print(", "); - } - } - writer.println(");"); - - DoubleMatrix2D cov = unscaledCovarianceSums[basePrevIndex][baseCurIndex].copy(); - cov.assign(F.div(counts[basePrevIndex][baseCurIndex])); - - writer.print("cov_" + BaseUtils.baseIndexToSimpleBase(baseCurIndex) + " = matrix(c("); - for (int channel1 = 0; channel1 < 4; channel1++) { - for (int channel2 = 0; channel2 < 4; channel2++) { - writer.print(cov.get(channel2, channel1) + (channel1 == 3 && channel2 == 3 ? "" : ",")); - } - } - writer.println("), nr=4, nc=4);\n"); - } - } - - writer.close(); - } catch (IOException e) { - } - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingReadModel.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingReadModel.java deleted file mode 100644 index c27b3c3fb..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingReadModel.java +++ /dev/null @@ -1,263 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; - -import java.io.File; -import java.util.ArrayList; - -/** - * BasecallingReadModel represents the statistical models for - * all bases in all cycles. 
It allows for easy training via - * the addTrainingPoint() method, and for the computation of - * the 4x4 likelihood matrix or the 1x4 probability vector - * (with contextual components marginalized out of the - * likelihood matrix). - * - * @author Kiran Garimella - */ -public class BasecallingReadModel { - private BasecallingBaseModel[] basemodels = null; - private boolean correctForContext = true; - - /** - * Constructs a BasecallingReadModel with space for a given read length. - * - * @param readLength the length of the reads to which this model will apply. - */ - public BasecallingReadModel(int readLength) { - initialize(readLength); - } - - /** - * Constructs a BasecallingReadModel and trains it using the specified training data. - * - * @param trainingData a set of RawReads from which the model will be trained. - */ - public BasecallingReadModel(ArrayList trainingData) { - initialize(trainingData.get(0).getReadLength()); - - train(trainingData); - } - - /** - * Initialize the model and set default parameters for each cycle appropriately. - * - * @param readLength the length of the reads to which this model will apply. - */ - public void initialize(int readLength) { - basemodels = new BasecallingBaseModel[readLength]; - - for (int cycle = 0; cycle < readLength; cycle++) { - basemodels[cycle] = new BasecallingBaseModel(cycle != 0 && correctForContext); - } - } - - /** - * Train the model using the specified training data. - * - * @param trainingData a set of RawReads from which the model will be trained. - */ - public void train(ArrayList trainingData) { - for ( RawRead read : trainingData ) { - addMeanPoints(read); - } - - for ( RawRead read : trainingData ) { - addCovariancePoints(read); - } - } - - /** - * Add a training point for the mean intensity values per base and per cycle. 
- * - * @param cycle the cycle number (0-based) - * @param probMatrix the probability matrix for the base - * @param fourintensity the four raw intensities for the base - */ - public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) { - basemodels[cycle].addMeanPoint(probMatrix, fourintensity); - } - - /** - * Add a training point for the mean intensity values per base in all cycles. - * - * @param read the raw read - */ - public void addMeanPoints(RawRead read) { - byte[] seqs = read.getSequence(); - byte[] quals = read.getQuals(); - short[][] ints = read.getIntensities(); - - for (int cycle = 0; cycle < seqs.length; cycle++) { - char basePrev = (char) ((cycle == 0) ? '.' : seqs[cycle - 1]); - char baseCur = (char) seqs[cycle]; - double probCur = QualityUtils.qualToProb(quals[cycle]); - - double[][] probMatrix = getBaseProbabilityMatrix(cycle, basePrev, baseCur, probCur); - - double[] fourIntensity = new double[4]; - for (int channel = 0; channel < 4; channel++) { - //fourIntensity[channel] = (double) ints[cycle][channel]; - fourIntensity[channel] = (double) ints[channel][cycle]; - } - - basemodels[cycle].addMeanPoint(probMatrix, fourIntensity); - } - } - - /** - * Add a training point for the intensity covariance matrix per base and per cycle. - * - * @param cycle the cycle number (0-based) - * @param probMatrix the probability matrix for the base - * @param fourintensity the four raw intensities for the base - */ - public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) { - basemodels[cycle].addCovariancePoint(probMatrix, fourintensity); - } - - /** - * Add a training point for the intensity covariance matrix per base in all cycles. 
- * - * @param read the raw read - */ - public void addCovariancePoints(RawRead read) { - byte[] seqs = read.getSequence(); - byte[] quals = read.getQuals(); - short[][] ints = read.getIntensities(); - - for (int cycle = 0; cycle < seqs.length; cycle++) { - char basePrev = (char) ((cycle == 0) ? '.' : seqs[cycle - 1]); - char baseCur = (char) seqs[cycle]; - double probCur = QualityUtils.qualToProb(quals[cycle]); - - double[][] probMatrix = getBaseProbabilityMatrix(cycle, basePrev, baseCur, probCur); - - double[] fourIntensity = new double[4]; - for (int channel = 0; channel < 4; channel++) { - //fourIntensity[channel] = (double) ints[cycle][channel]; - fourIntensity[channel] = (double) ints[channel][cycle]; - } - - basemodels[cycle].addCovariancePoint(probMatrix, fourIntensity); - } - } - - /** - * Compute the likelihoods that a given set of intensities yields each possible base. - * - * @param cycle the cycle number (0-based) - * @param fourintensity the four raw intensities for the base - * @return the matrix of likelihoods - */ - public double[][] computeLikelihoods(int cycle, double[] fourintensity) { - return basemodels[cycle].computeLikelihoods(cycle, fourintensity); - } - - /** - * Compute the probabilities that a given set of intensities yields each possible base. 
- * - * @param cycle the cycle number (0-based) - * @param basePrev the previous base - * @param qualPrev the previous base's quality score - * @param fourintensity the four raw intensities for the base - * @return the probability distribution over the four base possibilities - */ - public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) { - double[][] likes = computeLikelihoods(cycle, fourintensity); - - double total = 0; - - for (int basePrevIndex = 0; basePrevIndex < likes.length; basePrevIndex++) { - for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { - double prior = 1.0; - if (correctForContext) { - double prob = QualityUtils.qualToProb(qualPrev); - if (basePrevIndex == BaseUtils.simpleBaseToBaseIndex(basePrev)) { - prior = prob; - } else { - prior = (1.0 - prob)/((double) (4*likes.length - 1)); - } - } - likes[basePrevIndex][baseCurIndex] = prior*likes[basePrevIndex][baseCurIndex]; - total += likes[basePrevIndex][baseCurIndex]; - } - } - - for (int basePrevIndex = 0; basePrevIndex < likes.length; basePrevIndex++) { - for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { - likes[basePrevIndex][baseCurIndex] /= total; - } - } - - return new FourProb(likes); - } - - /** - * Call the bases in the given RawRead. - * - * @param read the RawRead - * @return the basecalled read - */ - public FourProbRead call(RawRead read) { - FourProbRead fpr = new FourProbRead(read.getReadLength()); - - for (int cycle = 0; cycle < read.getReadLength(); cycle++) { - char basePrev = (char) ((cycle == 0) ? '.' : read.getSequence()[cycle - 1]); - byte qualPrev = ((cycle == 0) ? 
0 : read.getQuals()[cycle - 1]); - - double[] fourIntensity = new double[4]; - for (int channel = 0; channel < 4; channel++) { - //fourIntensity[channel] = (double) read.getIntensities()[cycle][channel]; - fourIntensity[channel] = (double) read.getIntensities()[channel][cycle]; - } - - fpr.add(cycle, computeProbabilities(cycle, basePrev, qualPrev, fourIntensity)); - - } - - return fpr; - } - - /** - * Return the probability matrix given the previous cycle's base, the current cycle's base, and the current base's probability. - * - * @param cycle the cycle number (0-based) - * @param basePrev the previous base - * @param baseCur the current base - * @param probCur the probability of the current base - * @return the probability matrix of the base - */ - public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) { - double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4]; - - int actualBasePrevIndex = (correctForContext && cycle > 0) ? BaseUtils.simpleBaseToBaseIndex(basePrev) : 0; - int actualBaseCurIndex = BaseUtils.simpleBaseToBaseIndex(baseCur); - - if (actualBasePrevIndex == -1) { actualBasePrevIndex = BaseUtils.getRandomBaseIndex(); } - if (actualBaseCurIndex == -1) { actualBaseCurIndex = BaseUtils.getRandomBaseIndex(); } - - double residualTheories = (double) (dist.length*dist[0].length - 1); - - for (int basePrevIndex = 0; basePrevIndex < dist.length; basePrevIndex++) { - for (int baseCurIndex = 0; baseCurIndex < dist[basePrevIndex].length; baseCurIndex++) { - dist[basePrevIndex][baseCurIndex] = (basePrevIndex == actualBasePrevIndex && baseCurIndex == actualBaseCurIndex) ? probCur : ((1.0 - probCur)/residualTheories); - } - } - - return dist; - } - - /** - * Write model parameters to disk. - * - * @param dir the directory in which model parameters should be stored. 
- */ - public void write(File dir) { - for (int cycle = 0; cycle < basemodels.length; cycle++) { - File outparam = new File(dir.getPath() + "/param." + cycle + ".r"); - basemodels[cycle].write(outparam); - } - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingStats.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingStats.java deleted file mode 100755 index 8b5c5c5b2..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingStats.java +++ /dev/null @@ -1,106 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import org.broadinstitute.sting.utils.BaseUtils; - -/** - * BasecallingStats is a utility class to aggregate and emit basecalling - * stats (total bases seen and consistency between basecalling methods). - * - * @author Kiran Garimella - */ -public class BasecallingStats { - private int basesConsistent = 0; - private int basesTotal = 0; - private int readsTotal = 0; - - /** - * Constructor that does nothing. - */ - public BasecallingStats() {} - - /** - * Return the number of bases called identically by two different methods. - * - * @return the number of consistent bases. - */ - public int getBasesConsistent() { - return basesConsistent; - } - - /** - * Return the total number of bases seen. - * - * @return the total number of bases seen. - */ - public int getBasesTotal() { - return basesTotal; - } - - /** - * Return the total number of reads seen. - * - * @return the total number of reads seen. - */ - public int getReadsTotal() { - return readsTotal; - } - - /** - * Return the percent of bases called consistently by two different methods. - * - * @return the percent of bases called consistently - */ - public double getPercentConsistent() { - return 100.0*((double) getBasesConsistent())/((double) getBasesTotal()); - } - - /** - * Updates the number of bases seen, the number of reads seen, and the number of consistent bases. 
- * - * @param rr the raw Illumina read - * @param fpr the FourProb read - */ - public void update(RawRead rr, FourProbRead fpr) { - for (int cycle = 0; cycle < fpr.size(); cycle++) { - int rawBaseIndex = BaseUtils.simpleBaseToBaseIndex((char) rr.getSequence()[cycle]); - int fpBaseIndex = fpr.get(cycle).indexAtRank(0); - - if (rawBaseIndex >= 0 && fpBaseIndex >= 0) { - basesTotal++; - - if (rawBaseIndex == fpBaseIndex) { - basesConsistent++; - } - } - } - - readsTotal++; - } - - /** - * Returns basecalling stats info in a nicely formatted string. - * - * @return nicely formatted string containing basecalling stats - */ - public String toString() { - return String.format(" Reads seen: %d; %% bases consistent: %d/%d (%2.2f%%)", getReadsTotal(), getBasesConsistent(), getBasesTotal(), getPercentConsistent()); - } - - /** - * Periodically print a line containing basecalling stats after having seen a certain number of reads. - * - * @param interval the periodicity (in number of reads) of the messages given in number of bases observed - */ - public void notifyOnInterval(int interval) { - if (getReadsTotal() > 0 && getReadsTotal() % interval == 0) { - System.out.printf("%s\n", toString()); - } - } - - /** - * Immediately print a line containing basecalling stats. 
- */ - public void notifyNow() { - System.out.printf("%s\n", toString()); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingTrainer.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingTrainer.java deleted file mode 100755 index c4dd0a424..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/BasecallingTrainer.java +++ /dev/null @@ -1,275 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.util.CloseableIterator; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import net.sf.picard.reference.ReferenceSequence; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.StingException; - -import java.io.File; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Vector; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * BasecallingTrainingSet holds a set of raw read sequences, their raw intensities, and quality scores. - * - * @author Kiran Garimella - */ -public class BasecallingTrainer { - private File bustardDir; - private int lane; - private int trainingLimit; - - private ArrayList trainingData; - - /** - * Constructor for BasecallingTrainingSet. - * - * @param bustardDir the Bustard directory for the sample - * @param lane the lane for the sample - * @param trainingLimit the number of training reads to accept - */ - public BasecallingTrainer(File bustardDir, int lane, int trainingLimit) { - this.bustardDir = bustardDir; - this.lane = lane; - this.trainingLimit = trainingLimit; - } - - /** - * Get the training data array list. - * - * @return the arraylist of raw training reads - */ - public ArrayList getTrainingData() { - return this.trainingData; - } - - /** - * Set the training data array list. 
- * - * @param trainingData the arraylist of raw training reads - */ - public void setTrainingData(ArrayList trainingData) { - this.trainingData = trainingData; - } - - /** - * Take the first N reads that have no ambiguous bases, an average quality score greater - * than or equal to 15, and are not largely homopolymers and add them to the training set. - */ - public void loadFirstNReasonableReadsTrainingSet() { - this.trainingData = new ArrayList(trainingLimit); - - IlluminaParser iparser = new IlluminaParser(bustardDir, lane); - - RawRead rawread; - int numreads = 0; - - while (numreads < trainingLimit && iparser.next()) { - rawread = iparser.getRawRead(); - - int numAmbiguous = 0; - byte[] sequence = rawread.getSequence(); - - for ( byte byteBase : sequence ) { - if (BaseUtils.simpleBaseToBaseIndex((char) byteBase) == -1) { - numAmbiguous++; - } - } - - if (numAmbiguous == 0 && getAverageQualityScore(rawread) >= 15 && BaseUtils.mostFrequentBaseFraction(rawread.getSequence()) < 0.4) { - trainingData.add(rawread); - numreads++; - } - } - } - - /** - * Take the first N reads that have no ambiguous bases, an average quality score greater - * than or equal to 15, and are not largely homopolymers and add them to the training set. - * - * @param bustardDir the bustard directory - * @param lane the lane number - * @param trainingLimit how many reads should we use to train? 
- */ - public static void loadNReasonableReadsTrainingSet(File bustardDir, int lane, int trainingLimit) { - ArrayList trainingData = new ArrayList(trainingLimit); - - IlluminaParser iparser = new IlluminaParser(bustardDir, lane); - - RawRead rawread; - int numreads = 0; - - while (numreads < trainingLimit && iparser.next()) { - rawread = iparser.getRawRead(); - - int numAmbiguous = 0; - byte[] sequence = rawread.getSequence(); - - for ( byte byteBase : sequence ) { - if (BaseUtils.simpleBaseToBaseIndex((char) byteBase) == -1) { - numAmbiguous++; - } - } - - if (numAmbiguous == 0 && getAverageQualityScore(rawread) >= 15 && BaseUtils.mostFrequentBaseFraction(rawread.getSequence()) < 0.4) { - trainingData.add(rawread); - numreads++; - } - } - } - - /** - * Return the average quality score of a raw read. - * - * @param read the raw read - * @return the average quality score - */ - private static double getAverageQualityScore(RawRead read) { - double averageQual = 0; - - for ( byte qual : read.getQuals() ) { - averageQual += qual; - } - - return averageQual / ((double) read.getReadLength()); - } - - /** - * Load a training set from perfect reads in an already-aligned bam file. - * - * @param samIn the SAM/BAM file to load the reads from - * @param reference the reference to which the reads should be compared - */ - public void loadPreAlignedTrainingSet(File samIn, File reference) { - Vector< HashMap > trainingReads = getPerfectAlignmentsByTile(samIn, reference); - - trainingData = correlateReadsAndIntensities(trainingReads); - } - - /** - * Find perfect reads and group them by tile. 
- * - * @param samIn the SAM/BAM file to load the raeds from - * @param reference the reference to which the reads should be compared - * @return a vector of perfect reads, grouped by tile - */ - private Vector> getPerfectAlignmentsByTile(File samIn, File reference) { - ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(reference); - String currentContig = "none"; - byte[] refbases = null; - - SAMFileReader sf = new SAMFileReader(samIn); - SAMRecord sr; - CloseableIterator sfit = sf.iterator(); - - Vector< HashMap > trainingReads = new Vector< HashMap >(101); - int numTrainingReads = 0; - - while (numTrainingReads < trainingLimit && (sr = sfit.next()) != null) { - if (sr.getCigar().numCigarElements() == 1) { - int offset = sr.getAlignmentStart(); - - if (!currentContig.matches(sr.getReferenceName())) { - ReferenceSequence refSeq = ref.nextSequence(); - while( !refSeq.getName().equals(sr.getReferenceName()) ) - refSeq = ref.nextSequence(); - currentContig = sr.getReferenceName(); - refbases = refSeq.getBases(); - } - - int mismatches = 0; - - String refString = ""; - for (int i = offset, j = 0; i < offset + sr.getReadBases().length; i++, j++) { - refString += (char) refbases[i - 1]; - - mismatches += (BaseUtils.simpleBaseToBaseIndex((char) refbases[i - 1]) != - BaseUtils.simpleBaseToBaseIndex((char) sr.getReadBases()[j])) - ? 
1 : 0; - } - - if (mismatches == 0) { - Pattern p = Pattern.compile(":(\\d):(\\d+):(\\d+):(\\d+)#"); - Matcher m = p.matcher(sr.getReadName()); - - if (m.find()) { - int tile = Integer.valueOf(m.group(2)); - String readKey = String.format("%s:%s:%s:%s", m.group(1), m.group(2), m.group(3), m.group(4)); - - if (tile > trainingReads.size()) { - trainingReads.setSize(tile + 1); - } - - if (trainingReads.get(tile) == null) { - trainingReads.set(tile, new HashMap()); - } - - trainingReads.get(tile).put(readKey, sr); - numTrainingReads++; - } else { - throw new StingException("Pattern '" + p.pattern() + "' does not match against read name '" + sr.getReadName() + "'"); - } - } - } - } - - return trainingReads; - } - - /** - * Correlate the perfect reads with their intensities (at least, theoretically). This doesn't work right now... - * - * @param trainingReads the set of training reads, hashed by tile - * @return the final training set with intensities added in - */ - private ArrayList correlateReadsAndIntensities(Vector> trainingReads) { - ArrayList newTrainingData = new ArrayList(trainingLimit); - - IlluminaParser iparser = new IlluminaParser(bustardDir, lane); - - int totalReadCount = 0; - - for (int tile = 1; tile < trainingReads.size(); tile++) { - iparser.seekToTile(tile); - - int tileReadCount = 0; - - RawRead iread; - while (trainingReads.get(tile) != null && tileReadCount < trainingReads.get(tile).size() && iparser.next()) { - iread = iparser.getRawRead(); - String readKey = iread.getReadKey(); - - if (trainingReads.get(tile).containsKey(readKey)) { - System.out.printf("\tTile %d: found %d of %d (%4.4f in tile, %4.4f total) \r", - tile, - tileReadCount, - trainingReads.get(tile).size(), - ((double) tileReadCount)/((double) trainingReads.get(tile).size()), - ((double) totalReadCount)/((double) trainingLimit)); - - byte[] quals = new byte[iread.getReadLength()]; - for (int qualIndex = 0; qualIndex < quals.length; qualIndex++) { - quals[qualIndex] = 40; - } - - 
iread.setQuals(quals); - newTrainingData.add(iread); - - tileReadCount++; - totalReadCount++; - } - } - } - - iparser.close(); - - return newTrainingData; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/CombineSamAndFourProbs.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/CombineSamAndFourProbs.java deleted file mode 100755 index d2805337a..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/CombineSamAndFourProbs.java +++ /dev/null @@ -1,119 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; - -public class CombineSamAndFourProbs extends CommandLineProgram { - public static CombineSamAndFourProbs Instance = null; - - @Argument(fullName="sam", shortName="S", doc="Input SAM file") - public File SAM; - @Argument(fullName="fourprob", shortName="F", doc="Input text file := read_name sq_field") - public File FOURPROBS; - @Argument(fullName="sam_out", shortName="O", doc="Output SAM file") - public File SAM_OUT; - - public static void main(String[] argv) { - Instance = new CombineSamAndFourProbs(); - start(Instance, argv); - } - - protected int execute() { - BufferedReader fpreader = null; - - try { - fpreader = new BufferedReader(new FileReader(FOURPROBS)); - - HashMap fourprobMap = new HashMap(27000000); - - String fourprobLine; - int processed = 0; - while ((fourprobLine = fpreader.readLine()) != null) { - String[] fourprobPieces = fourprobLine.split("\\s+"); - String[] sqfield = fourprobPieces[1].split(":"); - byte[] 
sq = hexStringToBytes(sqfield[2]); - - fourprobMap.put(fourprobPieces[0], sq); - - if (processed % 1000000 == 0) { - System.out.println("Processed " + processed); - } - processed++; - } - - fpreader.close(); - - SAMFileReader sf = new SAMFileReader(SAM); - sf.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); - - SAMFileWriter sw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sf.getFileHeader(), true, SAM_OUT); - - for (SAMRecord sr : sf) { - String readname = sr.getReadName(); - byte[] sq = (byte[]) fourprobMap.get(readname); - - if (sq != null) { - if (sr.getReadNegativeStrandFlag()) { - sq = QualityUtils.reverseComplementCompressedQualityArray(sq); - } - - sr.setAttribute("SQ", sq); - sw.addAlignment(sr); - } - } - - sf.close(); - sw.close(); - } catch (IOException e) { - System.err.println("There was an error."); - System.exit(1); - } - - return 0; - } - - static String bytesToHexString(final byte[] data) { - final char[] chars = new char[2 * data.length]; - for (int i = 0; i < data.length; i++) { - final byte b = data[i]; - chars[2*i] = toHexDigit((b >> 4) & 0xF); - chars[2*i+1] = toHexDigit(b & 0xF); - } - return new String(chars); - } - - static byte[] hexStringToBytes(final String s) throws NumberFormatException { - if (s.length() % 2 != 0) { - throw new NumberFormatException("Hex representation of byte string does not have even number of hex chars: " + s); - } - final byte[] ret = new byte[s.length() / 2]; - for (int i = 0; i < ret.length; ++i) { - ret[i] = (byte) ((fromHexDigit(s.charAt(i * 2)) << 4) | fromHexDigit(s.charAt(i * 2 + 1))); - } - return ret; - } - - private static char toHexDigit(final int value) { - return (char) ((value < 10) ? 
('0' + value) : ('A' + value - 10)); - } - - private static int fromHexDigit(final char c) throws NumberFormatException { - final int ret = Character.digit(c, 16); - if (ret == -1) { - throw new NumberFormatException("Not a valid hex digit: " + c); - } - return ret; - } -} - diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FirecrestFileParser.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FirecrestFileParser.java deleted file mode 100644 index 856bc1a05..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FirecrestFileParser.java +++ /dev/null @@ -1,102 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package org.broadinstitute.sting.secondarybase; - -import net.sf.picard.PicardException; -import edu.mit.broad.picard.util.BasicTextFileParser; -import net.sf.picard.util.FormatUtil; - -import java.io.File; - -/** - * Class to parse the data in an Illumina Firecrest directory and return an iterator over that data, in order - * by tile. 
- * - * @author Kiran Garimella - */ -public class FirecrestFileParser extends AbstractFirecrestFileParser { - - private BasicTextFileParser parser; - private final FormatUtil formatter = new FormatUtil(); - private final File[] intensityFiles; - - /** - * Constructor - * - * @param firecrestDirectory directory where the Firecrest files can be located - * @param lane the lane to parse - */ - public FirecrestFileParser(final File firecrestDirectory, final int lane) { - super(firecrestDirectory, lane); - intensityFiles = getFilesMatchingRegexp("s_" + lane + "_\\d{4}_int.txt(.gz)?"); - } - - @Override - public boolean isValidFirecrestDirectory() { - return (intensityFiles.length > 0); - } - - /** - * Sorts the relevant files in the firecrestDirectory. Does some basic sanity checking to ensure that some files - * are found and that they are the expected multiple for paired-end or not. - */ - @Override - protected void prepareToIterate() { - // Some basic sanity checking on file counts - if (intensityFiles.length == 0) { - throw new PicardException("No Firecrest 1.3 intensity files found in " + firecrestDirectory.getAbsolutePath() + " for lane " + lane); - } - - // Sort each set of reads and create a text parser for it - parser = makeParserForTextFiles(true, intensityFiles); - } - - /** - * Parses the next line from the parser and constructs a FirecrestReadData object from it - * The first 4 fields are position information for the read, and the remaining value are - * the intensities data. 
- * - * @return a fully populated FirecrestReadData object - */ - protected FirecrestReadData readNext() { - if (!parser.hasNext()) { - return null; - } - final String[] data = parser.next(); - final int lane = formatter.parseInt(data[0]); - final int tile = formatter.parseInt(data[1]); - final int x = formatter.parseInt(data[2]); - final int y = formatter.parseInt(data[3]); - - int intensityOffset = 4; - int numIntensities = (data.length - 4)/4; - - double[][] intensities = new double[numIntensities][4]; - - for (int cycle = 0, index = intensityOffset; cycle < numIntensities; cycle++) { - for (int channel = 0; channel < 4; channel++, index++) { - intensities[cycle][channel] = formatter.parseFloat(data[index]); - } - } - - return new FirecrestReadData(lane, tile, x, y, intensities); - } - - /** - * Closes the underlying PasteParser - */ - @Override - public void close() { - if (parser != null) { - parser.close(); - } - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FirecrestFilenameComparator.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FirecrestFilenameComparator.java deleted file mode 100644 index 34a6e79fd..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FirecrestFilenameComparator.java +++ /dev/null @@ -1,75 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -//package edu.mit.broad.picard.illumina; -package org.broadinstitute.sting.secondarybase; - -import java.io.File; -import java.util.Comparator; - -/** - * Comparator for getting Firecrest files in "sorted" order for use by the FirecrestFileParser. 
Expected order is - * by lane in ascending order, then by tile in ascending order. - * - * IMPORTANT: Currently this class expects to receive ONLY int files. - * - * @author Kiran Garimella - */ -public class FirecrestFilenameComparator implements Comparator { - - /** - * Compares its two arguments for order. Returns a negative integer, zero, or a positive integer as - * the first argument is less than, equal to, or greater than the second. - * - * @param file1 - * @param file2 - * @return a negative integer, zero, or a positive integer as - * the first argument is less than, equal to, or greater than the second. - */ - public int compare(File file1, File file2) - { - Integer parts1[] = parseFileNameParts(file1.getName()); - Integer parts2[] = parseFileNameParts(file2.getName()); - - for (int i = 1; i < parts1.length; i++) - { - if (!parts1[i].equals(parts2[i])) { - return parts1[i].compareTo(parts2[i]); - } - } - return 0; - } - - /** - * Utility method that returns an array of integers that represent, in order, - * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any) - * represented by the given file name - * - * @param name - * @return an array of integers that represent, in order, - * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any) - * represented by the given file name - */ - private Integer[] parseFileNameParts(String name) - { - Integer parts[] = new Integer[3]; // Lane, tile, read - String src[] = name.split("_"); - parts[0] = new Integer(src[1]); // Lane is always the second part - if (src[2].length() == 4) { // Tile is 3rd or fourth - parts[1] = new Integer(src[2]); - } - else { - parts[1] = new Integer(src[3]); - } - if (src[2].length() == 1) { // read is last - parts[2] = new Integer(src[2]); - } - return parts; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FirecrestReadData.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FirecrestReadData.java 
deleted file mode 100644 index 74970844f..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FirecrestReadData.java +++ /dev/null @@ -1,57 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package org.broadinstitute.sting.secondarybase; - -/** - * Holds all the Firecrest-level data we need (so far) about an individual read. - * - * @author Kiran Garimella - */ -public class FirecrestReadData { - final private int laneNumber; - final private int tileNumber; - final private int xCoordinate; - final private int yCoordinate; - final private double[][] intensities; - - - /** - * Constructor that takes everything to populate this object - * - * @param laneNumber - * @param tileNumber - * @param xCoordinate - * @param yCoordinate - * @param intensities - */ - public FirecrestReadData(int laneNumber, int tileNumber, int xCoordinate, int yCoordinate, double[][] intensities) { - this.laneNumber = laneNumber; - this.tileNumber = tileNumber; - this.xCoordinate = xCoordinate; - this.yCoordinate = yCoordinate; - this.intensities = intensities; - } - - /** - * Composes a name for this read from its values. 
- * - * @return the read name - */ - public String getReadName() { - return this.laneNumber + ":" + this.tileNumber + ":" + this.xCoordinate + ":" + this.yCoordinate + "#0"; - } - - public int getLaneNumber() { return laneNumber; } - public int getTileNumber() { return tileNumber; } - public int getXCoordinate() { return xCoordinate; } - public int getYCoordinate() { return yCoordinate; } - public double[][] getIntensities() { return intensities; } - -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FourIntensity.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FourIntensity.java deleted file mode 100755 index 064031f00..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FourIntensity.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -public class FourIntensity { - private float[] fIntensities; - - public FourIntensity() { - fIntensities = new float[4]; - } - - public FourIntensity(float[] fIntensities) { - this.fIntensities = fIntensities; - } - - public FourIntensity(FourIntensity intensity) { - fIntensities = new float[4]; - - for (int channel = 0; channel < 4; channel++) { - fIntensities[channel] = intensity.getChannelIntensity(channel); - } - } - - public void add(FourIntensity intensity) { - for (int channel = 0; channel < 4; channel++) { - fIntensities[channel] += intensity.getChannelIntensity(channel); - } - } - - public void subtract(FourIntensity intensity) { - for (int channel = 0; channel < 4; channel++) { - fIntensities[channel] -= intensity.getChannelIntensity(channel); - } - } - - public void divide(float divisor) { - for (int channel = 0; channel < 4; channel++) { - fIntensities[channel] /= divisor; - } - } - - public float getChannelIntensity(int channel) { return fIntensities[channel]; } - - public int brightestChannel() { - int brightest = 0; - - for (int channel = 1; channel < 4; channel++) { - if (fIntensities[channel] > 
fIntensities[brightest]) { - brightest = channel; - } - } - - return brightest; - } - - public String toString() { - return "(" + getChannelIntensity(0) + - ", " + getChannelIntensity(1) + - ", " + getChannelIntensity(2) + - ", " + getChannelIntensity(3) + - ")"; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FourProb.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FourProb.java deleted file mode 100755 index 49e4a9d04..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FourProb.java +++ /dev/null @@ -1,104 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.QualityUtils; - -/** - * FourProb represents four base hypotheses, their probabilities, and the ranking among one another. - * - * @author Kiran Garimella - */ -public class FourProb { - private double[] baseProbs; - private int[] baseIndices; - - /** - * Constructor for FourProb. - * - * @param baseLikes the unsorted base hypothesis probabilities (in ACGT order). - */ - public FourProb(double[][] baseLikes) { - double[] baseProbs = new double[4]; - for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { - for (int basePrevIndex = 0; basePrevIndex < baseLikes.length; basePrevIndex++) { - baseProbs[baseCurIndex] += baseLikes[basePrevIndex][baseCurIndex]; - } - } - - int[] baseIndices = {0, 1, 2, 3}; - - Integer[] perm = Utils.SortPermutation(baseProbs); - double[] ascendingBaseProbs = Utils.PermuteArray(baseProbs, perm); - int[] ascendingBaseIndices = Utils.PermuteArray(baseIndices, perm); - - this.baseProbs = new double[4]; - this.baseIndices = new int[4]; - - for (int i = 0; i < 4; i++) { - this.baseProbs[i] = ascendingBaseProbs[3 - i]; - this.baseIndices[i] = ascendingBaseIndices[3 - i]; - } - } - - /** - * Returns the index of the base at the specified rank. 
- * - * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned - * @return the index (0, 1, 2, 3). - */ - public int indexAtRank(int rank) { return baseIndices[rank]; } - - /** - * Returns the base label of the base at the specified rank. - * - * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned - * @return the base label (A, C, G, T). - */ - public char baseAtRank(int rank) { return baseIndexToBase(indexAtRank(rank)); } - - /** - * Returns the probability of the base at the specified rank. - * - * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned - * @return the probability of the base (0.0-1.0) - */ - public double probAtRank(int rank) { return baseProbs[rank]; } - - /** - * Returns the quality score of the base at the specified rank. - * - * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned - * @return the quality score of the base (0-40) - */ - public byte qualAtRank(int rank) { return QualityUtils.probToQual(probAtRank(rank)); } - - /** - * A utility method to convert a base index into a base label. - * - * @param baseIndex the index of the base (0, 1, 2, 3). - * @return A, C, G, T, or '.' if the base index can't be understood. - */ - private char baseIndexToBase(int baseIndex) { - switch (baseIndex) { - case 0: return 'A'; - case 1: return 'C'; - case 2: return 'G'; - case 3: return 'T'; - default: return '.'; - } - } - - /** - * Prettily formats the FourProb info. - * - * @return a prettily formatted Sting containing the base and quality score in rank order. 
- */ - public String toString() { - return ( - "[" + baseAtRank(0) + ":" + qualAtRank(0) + " " - + baseAtRank(1) + ":" + qualAtRank(1) + " " - + baseAtRank(2) + ":" + qualAtRank(2) + " " - + baseAtRank(3) + ":" + qualAtRank(3) + "]" - ); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FourProbRead.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FourProbRead.java deleted file mode 100755 index c5e74a1c4..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/FourProbRead.java +++ /dev/null @@ -1,96 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; - -import java.util.ArrayList; - -/** - * FourProbRead contains the four-prob information for each base in a read. - */ -public class FourProbRead extends ArrayList { - /** - * Initialize the container with the specified capacity. - * - * @param initialCapacity the number of bases in the read - */ - public FourProbRead(int initialCapacity) { - super(initialCapacity); - } - - /** - * Returns a subset of the FourProbRead. - * - * @param cycleStart the starting cycle (0-based, inclusive) - * @param cycleStop the ending cycle (0-based, inclusive) - * @return a FourProbRead that is a subset of this FourProbRead - */ - public FourProbRead getSubset(int cycleStart, int cycleStop) { - FourProbRead subset = new FourProbRead(cycleStop - cycleStart + 1); - - for (int cycle = cycleStart, offset = 0; cycle <= cycleStop; cycle++, offset++) { - subset.add(offset, this.get(cycle)); - } - - return subset; - } - - /** - * Get the read sequence at a specified rank. 
- * - * @param rank the rank of the sequence to return (0=best, 1=second-best, 2=third-best, 3=fourth-best) - * @return the read sequence at the specified rank - */ - public String getBaseSequenceAtGivenRank(int rank) { - String pseq = ""; - - for ( FourProb fp : this ) { - pseq += fp.baseAtRank(rank); - } - - return pseq; - } - - /** - * Get the primary read sequence. - * - * @return the primary read sequence - */ - public String getPrimaryBaseSequence() { - return getBaseSequenceAtGivenRank(0); - } - - /** - * Get the secondary read sequence. - * - * @return the secondary read sequence - */ - public String getSecondaryBaseSequence() { - return getBaseSequenceAtGivenRank(1); - } - - /** - * Get the SAM spec-conformant SQ tag that will be written to the SAM/BAM file. - * - * @param rr the raw read - * @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2) - */ - public byte[] getSQTag(RawRead rr) { - byte[] sqtag = new byte[this.size()]; - - for (int cycle = 0; cycle < this.size(); cycle++) { - FourProb fp = this.get(cycle); - - int fpPrimaryBaseIndex = fp.indexAtRank(0); - int rawPrimaryBaseIndex = BaseUtils.simpleBaseToBaseIndex(rr.getSequenceAsString().charAt(cycle)); - - int fpSecondaryBaseIndex = (fpPrimaryBaseIndex == rawPrimaryBaseIndex) ? 
fp.indexAtRank(1) : fpPrimaryBaseIndex; - - double qualdiff = -10.0*Math.log10(fp.probAtRank(2)/fp.probAtRank(1)); - - sqtag[cycle] = QualityUtils.baseAndProbDiffToCompressedQuality(fpSecondaryBaseIndex, qualdiff); - } - - return sqtag; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/IlluminaParser.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/IlluminaParser.java deleted file mode 100755 index c25fe695e..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/IlluminaParser.java +++ /dev/null @@ -1,234 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import edu.mit.broad.picard.util.BasicTextFileParser; -import edu.mit.broad.picard.util.PasteParser; -import org.broadinstitute.sting.utils.StingException; - -import java.io.Closeable; -import java.io.File; -import java.io.FilenameFilter; -import java.util.Arrays; -import java.util.Comparator; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * IlluminaParser parses raw Illumina data (raw intensities, basecalled read sequences and quality scores) - * and presents it to the developer in an easy-to-use form. It also permits random tile jumping. - * - * WARNING: This parser does not understand newer GAPipeline data formats, and instead relies on the older - * data formats that may have been generated with older Illumina tools. As a result, this may return - * suboptimal data. This parser only exists temporarily until the Picard team writes a much more sensible - * version. Proceed with caution. 
- * - * @author Kiran Garimella - */ -public class IlluminaParser implements Closeable { - private File bustardDir; - private File firecrestDir; - private int lane; - - private File[] intfiles; - private File[] seqfiles; - private File[] prbfiles; - - private int currentTileIndex; - private PasteParser currentTileParser; - private String[][] currentParseResult; - - /** - * Construct an IlluminaParser given the Bustard directory and lane. Infer the Firecrest directory. - * - * @param bustardDir the Illumina Bustard directory - * @param lane the Illumina lane - */ - public IlluminaParser(File bustardDir, int lane) { - this.bustardDir = bustardDir; - this.firecrestDir = bustardDir.getParentFile(); - this.lane = lane; - - initializeParser(); - } - - /** - * Construct an IlluminaParser given the Bustard directory, Firecrest directory and lane. - * - * @param bustardDir the Illumina Bustard directory - * @param firecrestDir the Illumina Firecrest directory - * @param lane the Illumina lane - */ - public IlluminaParser(File bustardDir, File firecrestDir, int lane) { - this.bustardDir = bustardDir; - this.firecrestDir = firecrestDir; - this.lane = lane; - - initializeParser(); - } - - /** - * Initialize the parser and seek to the first tile. 
- */ - private void initializeParser() { - intfiles = firecrestDir.listFiles(getFilenameFilter("int")); - seqfiles = bustardDir.listFiles(getFilenameFilter("seq")); - prbfiles = bustardDir.listFiles(getFilenameFilter("prb")); - - if (intfiles.length != seqfiles.length || intfiles.length != prbfiles.length || seqfiles.length != prbfiles.length) { - throw new StingException( - String.format("File list lengths are unequal (int:%d, seq:%d, prb:%d)", - intfiles.length, - seqfiles.length, - prbfiles.length) - ); - } - - Arrays.sort(intfiles, getTileSortingComparator()); - Arrays.sort(seqfiles, getTileSortingComparator()); - Arrays.sort(prbfiles, getTileSortingComparator()); - - seekToTile(1); - - // Todo: put some more consistency checks here - } - - /** - * Get the filename filter for files of a given type. - * - * @param suffix the type (i.e. 'int', 'seq', 'prb'). - * @return the filename filter - */ - private FilenameFilter getFilenameFilter(final String suffix) { - return new FilenameFilter() { - public boolean accept(File file, String s) { - Pattern pseq = Pattern.compile(String.format("s_%d_\\d+_%s\\.txt(?!.+old.+)(\\.gz)?", lane, suffix)); - Matcher mseq = pseq.matcher(s); - - return mseq.find(); - } - }; - } - - /** - * Get a comparator that sorts by tile. - * - * @return the comparator that sorts by tile. 
- */ - private Comparator getTileSortingComparator() { - return new Comparator() { - public int compare(File file1, File file2) { - Pattern ptile = Pattern.compile(String.format("s_%d_(\\d+)_", lane)); - - Matcher mtile1 = ptile.matcher(file1.getName()); - Matcher mtile2 = ptile.matcher(file2.getName()); - - if (mtile1.find() && mtile2.find()) { - int tile1 = Integer.valueOf(mtile1.group(1)); - int tile2 = Integer.valueOf(mtile2.group(1)); - - if (tile1 < tile2) { return -1; } - else if (tile1 > tile2) { return 1; } - - return 0; - } - - throw new StingException("Tile filenames ('" + file1.getName() + "' or '" + file2.getName() + "') did not match against regexp pattern ('" + ptile.pattern() + "')"); - } - }; - } - - /** - * Return the number of tiles. - * - * @return the number of tiles. - */ - public int numTiles() { return intfiles.length; } - - /** - * Seek to a specified tile. - * - * @param tile the tile to which we should seek - * @return true if we were able to seek to the tile, false if otherwise - */ - public boolean seekToTile(int tile) { - if (tile < intfiles.length - 1) { - currentTileIndex = tile - 1; - - BasicTextFileParser intparser = new BasicTextFileParser(true, intfiles[currentTileIndex]); - BasicTextFileParser seqparser = new BasicTextFileParser(true, seqfiles[currentTileIndex]); - BasicTextFileParser prbparser = new BasicTextFileParser(true, prbfiles[currentTileIndex]); - - currentTileParser = new PasteParser(intparser, seqparser, prbparser); - - return true; - } - - return false; - } - - /** - * Returns whether the parser has any more data to go through. - * - * @return true if there's data left, false if otherwise - */ - public boolean hasNext() { - return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1)); - } - - /** - * Advance the parser to the next read. 
- * - * @return true if successful, false if otherwise - */ - public boolean next() { - if (hasNext()) { - currentParseResult = currentTileParser.next(); - - return true; - } - - return false; - } - - /** - * Returns the result from the current parse as an matrix of Strings. - * - * @return the matrix of Strings containing the current parse result - */ - public String[][] getCurrentParseResult() { - return currentParseResult; - } - - /** - * Removes, um, something, but in reality, does nothing. - */ - public void remove() { - throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported."); - } - - /** - * Close the current tile. - */ - public void close() { - currentTileParser.close(); - } - - /** - * Returns a raw read containing the raw intensities, read sequence, and quality scores. - * - * @return the raw read - */ - public RawRead getRawRead() { - return getSubset(0, currentParseResult[1][4].length() - 1); - } - - /** - * Returns a subset of the current parse result as a raw read. 
- * - * @param cycleStart the starting cycle for the desired subset - * @param cycleStop the ending cycle for the desired subset - * @return the subset of the current parse result as a raw read - */ - public RawRead getSubset(int cycleStart, int cycleStop) { - return new RawRead(currentParseResult, cycleStart, cycleStop); - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/MatchSQTagToStrand.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/MatchSQTagToStrand.java deleted file mode 100644 index 6789b0fcc..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/MatchSQTagToStrand.java +++ /dev/null @@ -1,47 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.File; - -public class MatchSQTagToStrand extends CommandLineProgram { - public static MatchSQTagToStrand Instance = null; - - @Argument(fullName="sam_in", shortName="I", doc="Input SAM file") - public File SAM_IN; - @Argument(fullName="sam_out", shortName="O", doc="Output SAM file") - public File SAM_OUT; - - public static void main(String[] argv) { - Instance = new MatchSQTagToStrand(); - start(Instance, argv); - } - - protected int execute() { - SAMFileReader sf = new SAMFileReader(SAM_IN); - sf.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); - - SAMFileWriter sw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sf.getFileHeader(), true, SAM_OUT); - - for (SAMRecord sr : sf) { - if (sr.getReadNegativeStrandFlag()) { - byte[] sq = (byte[]) sr.getAttribute("SQ"); - sq = QualityUtils.reverseComplementCompressedQualityArray(sq); - - sr.setAttribute("SQ", sq); - } - - 
sw.addAlignment(sr); - } - - sw.close(); - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/RawRead.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/RawRead.java deleted file mode 100755 index 220299537..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/RawRead.java +++ /dev/null @@ -1,263 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -/** - * RawRead represents lane and tile coordinates, raw intensities, read bases, and quality scores - * - * @author Kiran Garimella - */ -public class RawRead implements Comparable { - private byte lane; - private short tile; - private short x; - private short y; - - private byte[] sequence; - private byte[] quals; - private short[][] intensities; - - /** - * Blank constructor. - */ - public RawRead() {} - - /** - * Construct a raw read from the output of a PasteParser (in the order of int, seq, prb). - * Takes data from entire read. - * - * @param pastedReadString the 3x(fragment length) output array from the PasteParser. - */ - public RawRead(String[][] pastedReadString) { - loadRange(pastedReadString, 0, pastedReadString[1][4].length() - 1); - } - - /** - * Construct a raw read from the output of a PasteParser (in the order of int, seq, prb). - * Takes data within specified cycle ranges. - * - * @param pastedReadString the 3x(fragment length) output array from the PasteParser. - * @param cycleBegin the start cycle for the read (0-based, inclusive) - * @param cycleEnd the end cycle for the read (0-based, inclusive) - */ - public RawRead(String[][] pastedReadString, int cycleBegin, int cycleEnd) { - loadRange(pastedReadString, cycleBegin, cycleEnd); - } - - /** - * Does the actual parsing of the PasteParser output. - * - * @param pastedReadString the 3x(fragment length) output array from the PasteParser. 
- * @param cycleBegin the start cycle for the read (0-based, inclusive) - * @param cycleEnd the end cycle for the read (0-based, inclusive) - */ - private void loadRange(String pastedReadString[][], int cycleBegin, int cycleEnd) { - lane = Byte.valueOf(pastedReadString[0][0]); - tile = Short.valueOf(pastedReadString[0][1]); - x = Short.valueOf(pastedReadString[0][2]); - y = Short.valueOf(pastedReadString[0][3]); - - sequence = pastedReadString[1][4].substring(cycleBegin, cycleEnd + 1).getBytes(); - - quals = new byte[sequence.length]; - //intensities = new short[sequence.length][4]; - intensities = new short[4][sequence.length]; - - for (int cycle = cycleBegin, offset = 0; cycle <= cycleEnd; cycle++, offset++) { - byte maxQual = -50; - - for (int fullReadIndex = 4*cycle; fullReadIndex < 4*cycle + 4; fullReadIndex++) { - byte qual = Byte.valueOf(pastedReadString[2][fullReadIndex]); - - if (qual > maxQual) { maxQual = qual; } - } - - quals[offset] = maxQual >= 0 ? maxQual : 0; - - for (int fullReadIndex = 4*cycle + 4, channel = 0; fullReadIndex < 4*cycle + 8; fullReadIndex++, channel++) { - double doubleChannelIntensity = Double.valueOf(pastedReadString[0][fullReadIndex]); - short shortChannelIntensity = (short) doubleChannelIntensity; - - //intensities[offset][channel] = shortChannelIntensity; - intensities[channel][offset] = shortChannelIntensity; - } - } - } - - /** - * Get lane number of read. - * - * @return lane number of read - */ - public byte getLane() { return lane; } - - /** - * Set lane number of read. - * - * @param lane lane number of read - */ - public void setLane(byte lane) { this.lane = lane; } - - /** - * Get tile number of read. - * - * @return tile number of read - */ - public short getTile() { return tile; } - - /** - * Set tile number of read. - * - * @param tile tile number of read - */ - public void setTile(short tile) { this.tile = tile; } - - /** - * Get x-coordinate of read. 
- * - * @return x-coordinate of read - */ - public int getXCoordinate() { return x; } - - /** - * Set x-coordinate of read. - * - * @param x x-coordinate of read - */ - public void setXCoordinate(short x) { this.x = x; } - - /** - * Get y-coordinate of read. - * - * @return y-coordinate of read - */ - public int getYCoordinate() { return y; } - - /** - * Set y-coordinate of read. - * - * @param y y-coordinate of read - */ - public void setYCoordinate(short y) { this.y = y; } - - /** - * Get read key (lane:tile:x:y). - * - * @return read key (lane:tile:x:y) - */ - public String getReadKey() { return String.format("%d:%d:%d:%d", lane, tile, x, y); } - - /** - * Get the read sequence between the cycles specified in the constructor as a byte array. - * - * @return read sequence - */ - public byte[] getSequence() { return sequence; } - - /** - * Set the read sequence from a byte array. - * - * @param sequence the read sequence in byte array form - */ - public void setSequence(byte[] sequence) { this.sequence = sequence; } - - /** - * Get the read sequence as a String. - * - * @return the read sequence in String form - */ - public String getSequenceAsString() { return new String(getSequence()); } - - /** - * Get the quals. - * - * @return a byte array of quals - */ - public byte[] getQuals() { return quals; } - - /** - * Set the quals. - * - * @param quals a byte array of quals - */ - public void setQuals(byte[] quals) { this.quals = quals; } - - /** - * Get the raw read intensities. - * - * @return the (numChannels)x(readLength) array of raw intensities - */ - public short[][] getIntensities() { return intensities; } - - /** - * Set the raw intensities. - * - * @param intensities the (numChannels)x(readLength) array of raw intensities - */ - public void setIntensities(short[][] intensities) { this.intensities = intensities; } - - /** - * Get the read length. 
- * - * @return the read length - */ - public int getReadLength() { return sequence.length; } - - /** - * Return the sum of the quality scores for this RawRead. - * - * @return the sum of the quality scores - */ - public int getQualityScoreSum() { - int qualSum = 0; - for ( byte qual : quals ) { - qualSum += (int) qual; - } - - return qualSum; - } - - /** - * Compare two RawRead objects by summing their quality scores. The one with lower aggregate quality is the "lesser" RawRead. - * - * @param rawRead the other RawRead - * @return -1, 0, or 1 if the RawRead on which this method is called is the lesser one, is equal to the comparison RawRead, or is greater than the comparison RawRead, respectively. - */ - public int compareTo(RawRead rawRead) { - int qualSum1 = this.getQualityScoreSum(); - int qualSum2 = rawRead.getQualityScoreSum(); - - if (qualSum1 < qualSum2) { return -1; } - else if (qualSum1 > qualSum2) { return 1; } - - return 0; - } - - public RawRead getSubset(int cycleStart, int cycleStop) { - RawRead subRead = new RawRead(); - - subRead.setLane(lane); - subRead.setTile(tile); - subRead.setXCoordinate(x); - subRead.setYCoordinate(y); - - byte[] newSequence = new byte[cycleStop - cycleStart + 1]; - byte[] newQuals = new byte[cycleStop - cycleStart + 1]; - //short[][] newIntensities = new short[cycleStop - cycleStart + 1][4]; - short[][] newIntensities = new short[4][cycleStop - cycleStart + 1]; - - for (int cycle = cycleStart, offset = 0; cycle <= cycleStop; cycle++, offset++) { - newSequence[offset] = sequence[cycle]; - newQuals[offset] = quals[cycle]; - //newIntensities[offset] = intensities[cycle]; - - for (int channel = 0; channel < 4; channel++) { - newIntensities[channel][offset] = intensities[channel][cycle]; - } - } - - subRead.setSequence(newSequence); - subRead.setQuals(newQuals); - subRead.setIntensities(newIntensities); - - return subRead; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/SecondaryBaseAnnotator.java 
b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/SecondaryBaseAnnotator.java deleted file mode 100755 index 24473c0b8..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/SecondaryBaseAnnotator.java +++ /dev/null @@ -1,55 +0,0 @@ -package org.broadinstitute.sting.secondarybase; - -import org.broadinstitute.sting.utils.containers.BoundedScoringSet; -import org.broadinstitute.sting.utils.StingException; - -import java.util.ArrayList; -import java.util.Arrays; - -public class SecondaryBaseAnnotator { - private static final int TRAINING_LIMIT = 10000; - private boolean trained; - private final BoundedScoringSet trainingAggregator; - public BasecallingReadModel model; - - public SecondaryBaseAnnotator() { - trained = false; - trainingAggregator = new BoundedScoringSet(TRAINING_LIMIT); - } - - public void addTrainingRead(RawRead rawRead) { trainingAggregator.add(rawRead); } - - public boolean haveEnoughTrainingReads() { return false; } - - public void doneTraining() { - ArrayList trainingData = new ArrayList(trainingAggregator.size()); - - trainingData.addAll(Arrays.asList(trainingAggregator.toArray(new RawRead[0]))); - - model = new BasecallingReadModel(trainingData); - - trained = true; - } - - public FourProbRead getFourProbRead(RawRead rawRead) { - return model.call(rawRead); - } - - public byte[] getSqTagValue(RawRead rawRead) { - if (!trained) { - throw new StingException("Model must be trained via addTrainingRead() before getSqTagValue() can be called"); - } - - FourProbRead fpr = model.call(rawRead); - - return fpr.getSQTag(rawRead); - } - - private void train() {} - - private byte[] getSQTag(FourProbRead fourProbRead, RawRead rawRead) { return null; } - - private static boolean isGoodTrainingRead(RawRead rawRead) { return false; } - - private static double getAverageQualityScore(RawRead rawRead) { return 0.0; } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/IlluminaTile.java 
b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/IlluminaTile.java deleted file mode 100755 index 8a7a419ff..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/IlluminaTile.java +++ /dev/null @@ -1,77 +0,0 @@ -package org.broadinstitute.sting.playground.piecemealannotator; - -import edu.mit.broad.picard.util.BasicTextFileParser; -import edu.mit.broad.picard.util.PasteParser; -import org.broadinstitute.sting.secondarybase.RawRead; -import org.broadinstitute.sting.utils.StingException; - -import java.io.Closeable; -import java.io.File; -import java.util.Iterator; - -public class IlluminaTile implements Iterator, Iterable, Closeable { - private PasteParser parser; - - private RawRead next; - private boolean isIterating = false; - - public IlluminaTile(File bustardDir, int lane, int tile) { - //BasicTextFileParser intparser = new BasicTextFileParser(true, new File(bustardDir.getParent() + "/" + String.format("s_%d_%04d_int.txt.gz", lane, tile))); - //BasicTextFileParser seqparser = new BasicTextFileParser(true, new File(bustardDir.getAbsolutePath() + "/" + String.format("s_%d_%04d_seq.txt.gz", lane, tile))); - //BasicTextFileParser prbparser = new BasicTextFileParser(true, new File(bustardDir.getAbsolutePath() + "/" + String.format("s_%d_%04d_prb.txt.gz", lane, tile))); - - BasicTextFileParser intparser = new BasicTextFileParser(true, findFile(bustardDir.getParentFile(), lane, tile, "int")); - BasicTextFileParser seqparser = new BasicTextFileParser(true, findFile(bustardDir, lane, tile, "seq")); - BasicTextFileParser prbparser = new BasicTextFileParser(true, findFile(bustardDir, lane, tile, "prb")); - - parser = new PasteParser(intparser, seqparser, prbparser); - } - - private File findFile(File dir, int lane, int tile, String ext) { - File file1 = new File(dir.getAbsolutePath() + "/" + String.format("s_%d_%04d_%s.txt.gz", lane, tile, ext)); - if (file1.exists()) { return file1; } - - File 
file2 = new File(dir.getAbsolutePath() + "/" + String.format("s_%d_%04d_%s.txt", lane, tile, ext)); - if (file2.exists()) { return file2; } - - throw new StingException(String.format("Can't find file '%s' or '%s'", file1.getName(), file2.getName())); - - } - - public boolean hasNext() { - if (!isIterating) { - iterator(); - } - - return parser.hasNext(); - } - - public RawRead next() { - if (hasNext()) { - next = new RawRead(parser.next()); - return next; - } - - return null; - } - - public void remove() { - throw new StingException("Remove is not implemented by IlluminaTile"); - } - - public void close() { - parser.close(); - isIterating = false; - } - - public Iterator iterator() { - if (isIterating) { - throw new StingException("IlluminaTile is already iterating"); - } - - isIterating = true; - next = new RawRead(parser.next()); - - return this; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/MergeAlignedAndSecondarySAMs.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/MergeAlignedAndSecondarySAMs.java deleted file mode 100755 index 070f8483c..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/MergeAlignedAndSecondarySAMs.java +++ /dev/null @@ -1,74 +0,0 @@ -package org.broadinstitute.sting.playground.piecemealannotator; - -import net.sf.samtools.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.File; -import java.util.HashMap; - -public class MergeAlignedAndSecondarySAMs extends CommandLineProgram { - public static MergeAlignedAndSecondarySAMs instance = null; - - @Argument(fullName="unaligned_sam", shortName="US", doc="unaligned SAM with secondary bases") public File USAM; - @Argument(fullName="aligned_sam", shortName="AS", doc="aligned 
SAM without secondary bases") public File ASAM; - @Argument(fullName="sam_out", shortName="SO", doc="output SAM file") public File OSAM; - - public static void main(String[] argv) { - instance = new MergeAlignedAndSecondarySAMs(); - start(instance, argv); - } - - protected int execute() { - // Hash unaligned file - HashMap usamhash = new HashMap(500000); - - SAMFileReader usamr = new SAMFileReader(USAM); - for (SAMRecord usr : usamr) { - String key = String.format("%s:%b", usr.getReadName(), usr.getFirstOfPairFlag()); - - usamhash.put(key, usr); - } - usamr.close(); - - // Annotate aligned file - SAMFileReader asamr = new SAMFileReader(ASAM); - - SAMFileHeader asamh = asamr.getFileHeader(); - asamh.setSortOrder(SAMFileHeader.SortOrder.unsorted); - SAMFileWriter osamw = new SAMFileWriterFactory().makeSAMOrBAMWriter(asamh, true, OSAM); - - for (SAMRecord asr : asamr) { - String key = String.format("%s:%b", asr.getReadName(), asr.getFirstOfPairFlag()); - - if (usamhash.containsKey(key)) { - String abases = asr.getReadString(); - String ubases1 = usamhash.get(key).getReadString(); - String ubases2 = (String) usamhash.get(key).getAttribute("SB"); - - if (asr.getReadNegativeStrandFlag()) { - ubases1 = BaseUtils.simpleReverseComplement(ubases1); - ubases2 = BaseUtils.simpleReverseComplement(ubases2); - } - - byte[] sqbases = new byte[abases.length()]; - for (int cycle = 0; cycle < abases.length(); cycle++) { - char sqbase = (abases.charAt(cycle) == ubases1.charAt(cycle)) ? 
ubases2.charAt(cycle) : ubases1.charAt(cycle); - int sqBaseIndex = BaseUtils.simpleBaseToBaseIndex(sqbase); - sqbases[cycle] = QualityUtils.baseAndProbDiffToCompressedQuality(sqBaseIndex, 0.0); - } - - asr.setAttribute("SQ", sqbases); - - osamw.addAlignment(asr); - } - } - - osamw.close(); - asamr.close(); - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/MergeAlignedSAMs.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/MergeAlignedSAMs.java deleted file mode 100755 index e6fdc68aa..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/MergeAlignedSAMs.java +++ /dev/null @@ -1,51 +0,0 @@ -package org.broadinstitute.sting.playground.piecemealannotator; - -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.File; - -public class MergeAlignedSAMs extends CommandLineProgram { - public static MergeAlignedSAMs instance = null; - - @Argument(fullName="sam_tile_prefix", shortName="STP", doc="SAM tile prefix") public String SAM_TILE_PREFIX; - @Argument(fullName="sam_tile_suffix", shortName="STS", doc="SAM tile suffix") public String SAM_TILE_SUFFIX; - @Argument(fullName="sam_out", shortName="SO", doc="SAM output file") public File SAM_OUT; - - public static void main(String[] argv) { - instance = new MergeAlignedSAMs(); - start(instance, argv); - } - - protected int execute() { - SAMFileWriter swriter = null; - - for (int tile = 1; tile <= 100; tile++) { - File tileFile = new File(String.format("%s.%d.%s", SAM_TILE_PREFIX, tile, SAM_TILE_SUFFIX)); - - SAMFileReader sreader = new SAMFileReader(tileFile); - - if (swriter == null) { - swriter = new 
SAMFileWriterFactory().makeSAMOrBAMWriter(sreader.getFileHeader(), true, SAM_OUT); - } - - System.out.println("Processing " + tileFile.getName() + " ..."); - - for (SAMRecord sr : sreader) { - swriter.addAlignment(sr); - } - - sreader.close(); - } - - if (swriter != null) { - swriter.close(); - } - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/MergePieces.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/MergePieces.java deleted file mode 100755 index f9c0c7cce..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/MergePieces.java +++ /dev/null @@ -1,85 +0,0 @@ -package org.broadinstitute.sting.playground.piecemealannotator; - -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.File; -import java.util.Date; -import java.util.HashMap; - -public class MergePieces extends CommandLineProgram { - public static MergePieces instance = null; - - @Argument(fullName="unaligned_sam_prefix", shortName="USP", doc="Prefix for unaligned SAM files") public String USAM_PREFIX; - @Argument(fullName="unaligned_sam_suffix", shortName="USS", doc="Suffix for unaligned SAM files") public String USAM_SUFFIX; - @Argument(fullName="tile_start", shortName="TS", doc="Starting tile number") public int TILE_START; - @Argument(fullName="tile_end", shortName="TE", doc="Ending tile number (inclusive)") public int TILE_END; - @Argument(fullName="aligned_sam_in", shortName="ASI", doc="Aligned SAM file") public File ALIGNED_SAM; - @Argument(fullName="annotated_sam_out", shortName="ASO", doc="Annotated SAM output file") public File ANNOTATED_SAM; - - public static 
void main(String[] argv) { - instance = new MergePieces(); - start(instance, argv); - } - - protected int execute() { - HashMap samHash = new HashMap(30000000); - - System.out.println("Hashing records..."); - for (int tile = TILE_START; tile <= TILE_END; tile++) { - System.out.printf(" %s: Hashing SQ tags from tile %d...\n", (new Date()).toString(), tile); - - File uTileFile = new File(String.format("%s.%d.%s", USAM_PREFIX, tile, USAM_SUFFIX)); - - SAMFileReader uTileReader = new SAMFileReader(uTileFile); - - for (SAMRecord sr : uTileReader) { - String key = String.format("%s:%b", sr.getReadName(), sr.getReadPairedFlag() && sr.getFirstOfPairFlag()); - - //System.out.printf("name=%s ispaired=%b negstrand=%b firstend=%b secondend=%b\n", sr.getReadName(), sr.getReadPairedFlag(), sr.getReadNegativeStrandFlag(), sr.getFirstOfPairFlag(), sr.getSecondOfPairFlag()); - - samHash.put(key, (byte[]) sr.getAttribute("SQ")); - } - - uTileReader.close(); - } - - SAMFileReader aReader = new SAMFileReader(ALIGNED_SAM); - SAMFileWriter aWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(aReader.getFileHeader(), true, ANNOTATED_SAM); - - System.out.println("Annotating reads..."); - int annotatedReads = 0; - for (SAMRecord sr : aReader) { - String key = String.format("%s:%b", sr.getReadName(), sr.getReadPairedFlag() && sr.getFirstOfPairFlag()); - - if (samHash.containsKey(key)) { - byte[] sqtag = samHash.get(key); - - if (sr.getReadNegativeStrandFlag()) { - sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag); - } - - sr.setAttribute("SQ", sqtag); - - annotatedReads++; - if (annotatedReads % 100000 == 0) { - System.out.printf(" %s: Annotated %d reads...\n", (new Date()).toString(), annotatedReads); - } - - //aWriter.addAlignment(sr); - } - - aWriter.addAlignment(sr); - } - - aReader.close(); - aWriter.close(); - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/SplitAlignedSAMByTile.java 
b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/SplitAlignedSAMByTile.java deleted file mode 100755 index be2b0753e..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/SplitAlignedSAMByTile.java +++ /dev/null @@ -1,63 +0,0 @@ -package org.broadinstitute.sting.playground.piecemealannotator; - -import net.sf.samtools.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.File; -import java.util.Date; -import java.util.Vector; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class SplitAlignedSAMByTile extends CommandLineProgram { - public static SplitAlignedSAMByTile instance = null; - - @Argument(fullName="sam_in", shortName="SI", doc="SAM file to split") public File SAM_IN; - @Argument(fullName="sam_out_prefix", shortName="SOP", doc="Output prefix for split SAMs") public String SAM_OUT_PREFIX; - - public static void main(String[] argv) { - instance = new SplitAlignedSAMByTile(); - start(instance, argv); - } - - protected int execute() { - SAMFileReader sreader = new SAMFileReader(SAM_IN); - - SAMFileHeader sheader = sreader.getFileHeader(); - sheader.setSortOrder(SAMFileHeader.SortOrder.unsorted); - - Vector swriters = new Vector(); - swriters.setSize(101); - for (int tile = 1; tile <= 100; tile++) { - File tileFile = new File(String.format("%s.%d.sam", SAM_OUT_PREFIX, tile)); - swriters.add(tile, new SAMFileWriterFactory().makeSAMOrBAMWriter(sheader, true, tileFile)); - } - - int reads = 0; - for (SAMRecord sr : sreader) { - Pattern p = Pattern.compile(":\\d:(\\d+):\\d+:\\d+#"); - Matcher m = p.matcher(sr.getReadName()); - - if (m.find()) { - int tile = Integer.valueOf(m.group(1)); - - swriters.get(tile).addAlignment(sr); - - if (reads % 1000000 == 0) { - System.out.printf("%s: processed %d reads ... 
\n", (new Date()).toString(), reads); - } - reads++; - } - } - - sreader.close(); - for (SAMFileWriter sfw : swriters) { - if (sfw != null) { - sfw.close(); - } - } - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/TileAnnotator.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/TileAnnotator.java deleted file mode 100755 index e8857f154..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/TileAnnotator.java +++ /dev/null @@ -1,320 +0,0 @@ -package org.broadinstitute.sting.playground.piecemealannotator; - -import net.sf.samtools.*; -import org.broadinstitute.sting.secondarybase.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.containers.BoundedScoringSet; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.File; -import java.util.ArrayList; -import java.util.Date; -import java.util.HashMap; -import java.util.HashSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class TileAnnotator extends CommandLineProgram { - public static TileAnnotator instance = null; - - private String currentContig = ""; - private byte[] refbases; - - @Argument(fullName="sam_tile_in", shortName="STI", doc="SAM tile file", required=false) public File SAM_TILE_IN; - @Argument(fullName="sam_tile_out", shortName="STO", doc="Annotated SAM tile output file") public File SAM_TILE_OUT; - @Argument(fullName="reference", shortName="R", doc="The fasta reference") public File REFERENCE; - @Argument(fullName="bustard_dir", shortName="D", doc="The Bustard directory") public File BUSTARD_DIR; - @Argument(fullName="training_limit", shortName="TL", doc="Number of reads to train from", required=false) public int 
TRAINING_LIMIT = 10000; - @Argument(fullName="run_barcode", shortName="RB", doc="Illumina run barcode") public String RUN_BARCODE; - @Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-75,76-151') (0-based, inclusive)") public String CYCLE_RANGES; - @Argument(fullName="lane", shortName="L", doc="The lane to process (if not specified, this will be read from the 'sam_tile_in' file)", required=false) public Integer lane; - @Argument(fullName="tile", shortName="T", doc="The tile to process (if not specified, this will be read from the 'sam_tile_in' file)", required=false) public Integer tile; - - public static void main(String[] argv) { - instance = new TileAnnotator(); - start(instance, argv); - } - - protected int execute() { - ArrayList> cycleRanges = getCycleRanges(CYCLE_RANGES); - SecondaryBaseAnnotator sba = new SecondaryBaseAnnotator(); - - System.out.printf("%s: Loading training set...\n", (new Date()).toString()); - loadTrainingData(sba); - - System.out.printf("%s: Calling bases...\n", (new Date()).toString()); - callBases(sba, cycleRanges); - - System.out.println("Done."); - - return 0; - } - - private ArrayList> getCycleRanges(String cycleRangesString) { - ArrayList< Pair > cycleRanges = new ArrayList< Pair >(); - - String[] pieces = cycleRangesString.split(","); - - Pattern p = Pattern.compile("(\\d+)-(\\d+)"); - - for (String piece : pieces) { - Matcher m = p.matcher(piece); - - if (m.find()) { - Integer cycleStart = new Integer(m.group(1)); - Integer cycleStop = new Integer(m.group(2)); - - cycleRanges.add(new Pair(cycleStart, cycleStop)); - } - } - - if (cycleRanges.size() == 0) { - throw new StingException("At least one cycle range must be specified."); - } - - if (cycleRanges.size() > 2) { - throw new StingException(cycleRanges.size() + " specified, but we're unable to handle more than 2."); - } - - return cycleRanges; - } - - private void loadTrainingData(SecondaryBaseAnnotator sba) { - 
IlluminaTile tileParser = new IlluminaTile(BUSTARD_DIR, lane, tile); - - for (RawRead rr : tileParser) { sba.addTrainingRead(rr); } - - tileParser.close(); - - sba.doneTraining(); - } - - private void callBases(SecondaryBaseAnnotator sba, ArrayList> cycleRanges) { - SAMFileHeader sheader = new SAMFileHeader(); - sheader.setSortOrder(SAMFileHeader.SortOrder.unsorted); - SAMFileWriter swriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(sheader, true, SAM_TILE_OUT); - - IlluminaTile tileParser = new IlluminaTile(BUSTARD_DIR, lane, tile); - - BasecallingStats bstats = new BasecallingStats(); - - for (RawRead rr : tileParser) { - bstats.update(rr, sba.getFourProbRead(rr)); - - byte[] sqtag = sba.getSqTagValue(rr); - - SAMRecord sr = constructSAMRecord(rr, sqtag, sheader, false, false); - - swriter.addAlignment(sr); - } - - bstats.notifyNow(); - - tileParser.close(); - swriter.close(); - } - - private SAMRecord constructSAMRecord(RawRead rr, byte[] sqtag, SAMFileHeader sfh, boolean isPaired, boolean isSecondEndOfPair) { - SAMRecord sr = new SAMRecord(sfh); - - sr.setReadName(String.format("%s:%d:%d:%d:%d#0", RUN_BARCODE, lane, tile, rr.getXCoordinate(), rr.getYCoordinate())); - sr.setReadUmappedFlag(true); - sr.setReadString(rr.getSequenceAsString()); - sr.setBaseQualities(rr.getQuals()); - sr.setAttribute("SQ", sqtag); - - sr.setReadPairedFlag(isPaired); - if (isPaired) { - sr.setMateUnmappedFlag(true); - sr.setFirstOfPairFlag(!isSecondEndOfPair); - sr.setSecondOfPairFlag(isSecondEndOfPair); - } - - return sr; - } - - /* - private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, boolean isPaired, boolean isSecondEndOfPair) { - SAMRecord sr = new SAMRecord(sfh); - - sr.setReadName(String.format("%s:%d:%d:%d:%d#0", RUN_BARCODE, lane, tile, rr.getXCoordinate(), rr.getYCoordinate())); - sr.setReadUmappedFlag(true); - sr.setReadString(rr.getSequenceAsString()); - sr.setBaseQualities(rr.getQuals()); - sr.setAttribute("SQ", fpr.getSQTag(rr)); - 
- sr.setReadPairedFlag(isPaired); - if (isPaired) { - sr.setMateUnmappedFlag(true); - sr.setFirstOfPairFlag(!isSecondEndOfPair); - sr.setSecondOfPairFlag(isSecondEndOfPair); - } - - return sr; - } - - private void callBases(BasecallingReadModel model, ArrayList> cycleRanges) { - SAMFileHeader sheader = new SAMFileHeader(); - sheader.setSortOrder(SAMFileHeader.SortOrder.unsorted); - SAMFileWriter swriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(sheader, true, SAM_TILE_OUT); - - IlluminaTile tileParser = new IlluminaTile(BUSTARD_DIR, lane, tile); - - BasecallingStats bstats = new BasecallingStats(); - - for (RawRead rr : tileParser) { - FourProbRead fpr = model.call(rr); - - for (int rangeIndex = 0; rangeIndex < cycleRanges.size(); rangeIndex++) { - FourProbRead fprEnd = fpr.getSubset(cycleRanges.get(rangeIndex).getFirst(), cycleRanges.get(rangeIndex).getSecond()); - RawRead rrEnd = rr.getSubset(cycleRanges.get(rangeIndex).getFirst(), cycleRanges.get(rangeIndex).getSecond()); - - SAMRecord sr = constructSAMRecord(rrEnd, fprEnd, sheader, cycleRanges.size() > 1, rangeIndex == 1); - - swriter.addAlignment(sr); - } - - bstats.update(rr, fpr); - bstats.notifyOnInterval(10000); - } - - bstats.notifyNow(); - - tileParser.close(); - swriter.close(); - } - - private ArrayList loadTrainingData() { - FastaSequenceFile2 ref = new FastaSequenceFile2(REFERENCE); - HashMap srs = loadTileAlignments(ref); - return loadGoodReads(srs, BUSTARD_DIR); - } - - private HashMap loadTileAlignments(FastaSequenceFile2 ref) { - HashMap srs = new HashMap(); - HashSet seenEnds = new HashSet(); - - int numPerfect = 0; - - if (SAM_TILE_IN != null && SAM_TILE_IN.exists()) { - SAMFileReader sreader = new SAMFileReader(SAM_TILE_IN); - - for (SAMRecord sr : sreader) { - Pattern p = Pattern.compile(":(\\d+):(\\d+):(\\d+):(\\d+)#"); - Matcher m = p.matcher(sr.getReadName()); - - if (m.find()) { - this.lane = Integer.valueOf(m.group(1)); - this.tile = Integer.valueOf(m.group(2)); - int x = 
Integer.valueOf(m.group(3)); - int y = Integer.valueOf(m.group(4)); - boolean end = sr.getReadPairedFlag() && sr.getSecondOfPairFlag(); - - String otherKey = String.format("%d:%d:%b", x, y, !end); - String currentKey = String.format("%d:%d:%b", x, y, end); - - seenEnds.add(currentKey); - - if (isWellAligned(sr, ref)) { - if (srs.containsKey(otherKey) || !seenEnds.contains(otherKey)) { - srs.put(currentKey, sr); - } - - if (srs.containsKey(currentKey) && srs.containsKey(otherKey)) { - numPerfect++; - if (numPerfect % (TRAINING_LIMIT < 1000 ? TRAINING_LIMIT : 1000) == 0) { - System.out.println(" " + numPerfect + " well-aligned reads"); - } - } - } else { - if (srs.containsKey(otherKey)) { - srs.remove(otherKey); - } - } - } - - if (numPerfect >= TRAINING_LIMIT) { break; } - } - - sreader.close(); - } - - return srs; - } - - private boolean isWellAligned(SAMRecord sr, FastaSequenceFile2 ref) { - boolean valid = false; - int mismatches = 0; - - if (!sr.getReadUnmappedFlag() && sr.getCigar().numCigarElements() == 1) { - if (!currentContig.matches(sr.getReferenceName())) { - ref.seekToContig(sr.getReferenceName()); - currentContig = sr.getReferenceName(); - - refbases = ref.nextSequence().getBases(); - } - - byte[] readbases = sr.getReadBases(); - int offset = sr.getAlignmentStart(); - - if (offset + readbases.length < refbases.length) { - valid = true; - - for (int i = offset, j = 0; i < offset + readbases.length; i++, j++) { - int refbase = BaseUtils.simpleBaseToBaseIndex((char) refbases[i - 1]); - int readbase = BaseUtils.simpleBaseToBaseIndex((char) readbases[j]); - - mismatches += (refbase >= 0 && readbase >= 0 && refbase != readbase) ? 
1 : 0; - } - } - } - - return (valid && mismatches == 0); - } - - private ArrayList loadGoodReads(HashMap srs, File bustardDir) { - ArrayList trainingData = new ArrayList(); - BoundedScoringSet additionalData = new BoundedScoringSet(TRAINING_LIMIT); - - IlluminaTile tileParser = new IlluminaTile(bustardDir, lane, tile); - - int correlatedReads = 0; - for (RawRead rr : tileParser) { - String key1 = String.format("%d:%d:%b", rr.getXCoordinate(), rr.getYCoordinate(), false); - String key2 = String.format("%d:%d:%b", rr.getXCoordinate(), rr.getYCoordinate(), true); - - if (srs.containsKey(key1) && srs.containsKey(key2)) { - byte[] quals = new byte[rr.getReadLength()]; - for (int cycle = 0; cycle < rr.getReadLength(); cycle++) { - quals[cycle] = (byte) (BaseUtils.simpleBaseToBaseIndex((char) rr.getSequence()[cycle]) >= 0 ? 50 : 0); - } - rr.setQuals(quals); - - trainingData.add(rr); - - correlatedReads++; - if (correlatedReads % (TRAINING_LIMIT < 1000 ? TRAINING_LIMIT : 1000) == 0) { - System.out.println(" " + correlatedReads + " intensity-correlated reads"); - } - } else { - additionalData.add(rr); - } - } - - tileParser.close(); - - System.out.printf(" found %d perfect reads with an optional reservoir of %d good reads\n", trainingData.size(), additionalData.size()); - - RawRead[] qrs = additionalData.toArray(new RawRead[0]); - int limit = (TRAINING_LIMIT - trainingData.size() < additionalData.size()) ? 
(TRAINING_LIMIT - trainingData.size()) : additionalData.size(); - for (int i = 0; i < limit; i++) { - trainingData.add(qrs[i]); - } - - return trainingData; - } - */ -} diff --git a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/VerifySAM.java b/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/VerifySAM.java deleted file mode 100755 index 836f44ef2..000000000 --- a/archive/java/src/org/broadinstitute/sting/oldsecondarybases/piecemealannotator/VerifySAM.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.playground.piecemealannotator; - -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.File; - -public class VerifySAM extends CommandLineProgram { - public static VerifySAM instance = null; - - @Argument(fullName="sam_in", shortName="SI", doc="SAM file to check") public File SAM_IN; - - public static void main(String[] argv) { - instance = new VerifySAM(); - start(instance, argv); - } - - protected int execute() { - SAMFileReader sfr = new SAMFileReader(SAM_IN); - - for ( SAMRecord sr : sfr ) { - if (sr.getAttribute("SQ") == null) { - throw new StingException(String.format("SAMRecord is missing an SQ tag\n%s", sr.format())); - } - } - - sfr.close(); - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/poolseq/PoolUtils.java b/archive/java/src/org/broadinstitute/sting/poolseq/PoolUtils.java deleted file mode 100755 index 1ab570fd1..000000000 --- a/archive/java/src/org/broadinstitute/sting/poolseq/PoolUtils.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to 
deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.utils; - -import net.sf.samtools.SAMRecord; - -import java.util.List; -import java.util.ArrayList; - -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.playground.gatk.walkers.poolseq.ReadOffsetQuad; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Aug 27, 2009 - * Time: 12:31:08 PM - * To change this template use File | Settings | File Templates. 
- */ -public class PoolUtils { - - private PoolUtils() { - } - - public static ReadOffsetQuad splitReadsByReadDirection(List reads, List offsets) { - ArrayList forwardReads; - ArrayList reverseReads; - ArrayList forwardOffsets; - ArrayList reverseOffsets; - - if (reads == null) { - forwardReads = null; - reverseReads = null; - forwardOffsets = null; - reverseOffsets = null; - } else { - forwardReads = new ArrayList(); - reverseReads = new ArrayList(); - forwardOffsets = new ArrayList(); - reverseOffsets = new ArrayList(); - - for (int readNo = 0; readNo < reads.size(); readNo++) { - if (reads.get(readNo).getReadNegativeStrandFlag()) { - forwardReads.add(reads.get(readNo)); - forwardOffsets.add(offsets.get(readNo)); - } else { - reverseReads.add(reads.get(readNo)); - reverseOffsets.add(offsets.get(readNo)); - } - } - } - - return new ReadOffsetQuad(forwardReads,forwardOffsets,reverseReads,reverseOffsets); - } - - - public static Pair, List> thresholdReadsByQuality(List reads, List offsets, byte qThresh) { - List threshReads; - List threshOffsets; - if (reads == null) { - threshReads = null; - threshOffsets = null; - } else if (qThresh <= 0) { - threshReads = reads; - threshOffsets = offsets; - } else { - threshReads = new ArrayList(); - threshOffsets = new ArrayList(); - - for (int readNo = 0; readNo < reads.size(); readNo++) { - if (reads.get(readNo).getBaseQualities()[offsets.get(readNo)] >= qThresh) { - threshReads.add(reads.get(readNo)); - threshOffsets.add(offsets.get(readNo)); - } // else do nothing - } - } - - return new Pair,List>(threshReads, threshOffsets); - } - - public static Pair,List> thresholdReadsByMappingQuality( List reads, List offsets, int mapQual ) { - List goodMapReads; - List goodMapOffsets; - if ( reads == null ) { - goodMapReads = null; - goodMapOffsets = null; - } else if ( mapQual < 0 ) { - goodMapReads = reads; - goodMapOffsets = offsets; - } else { - goodMapReads = new ArrayList(); - goodMapOffsets = new ArrayList(); - - for ( int 
readNo = 0; readNo < reads.size(); readNo ++ ) { - if ( reads.get(readNo).getMappingQuality() > mapQual ) { - goodMapReads.add(reads.get(readNo)); - goodMapOffsets.add(offsets.get(readNo)); - } - } - } - - return new Pair,List>(goodMapReads,goodMapOffsets); - } - - -} diff --git a/archive/java/src/org/broadinstitute/sting/poolseq/PowerBelowFrequencyWalker.java b/archive/java/src/org/broadinstitute/sting/poolseq/PowerBelowFrequencyWalker.java deleted file mode 100644 index 98cdae421..000000000 --- a/archive/java/src/org/broadinstitute/sting/poolseq/PowerBelowFrequencyWalker.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.poolseq; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.playground.utils.PoolUtils; -import net.sf.samtools.SAMRecord; - -import java.util.List; -import java.io.PrintStream; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Oct 8, 2009 - * Time: 9:44:35 AM - * To change this template use File | Settings | File Templates. - */ - -/** - * Given an input N, this walker calculates the power to detect a polymorphism with N, N-1, N-2, ..., 1 variant alleles in a pooled setting - */ -@By(DataSource.REFERENCE) -public class PowerBelowFrequencyWalker extends LocusWalker { - @Output - PrintStream out; - - @Argument(fullName="lodThreshold", shortName="lod", doc="Threshold for log likelihood ratio to be called a SNP. Defaults to 3.0", required = false) - public double lodThresh = 3.0; - - @Argument(fullName="minimumQScore", shortName="qm", doc="Use bases whose phred (Q) score meets or exceeds this number. Defaults to 0", required = false) - public byte minQ = 0; - - @Argument(fullName="poolSize", shortName="ps", doc="Number of individuals in the pool", required = true) - public int numIndividuals = 0; - - @Argument(fullName="alleleFrequency", shortName="af", doc="Calculate power for all allele frequencies below this. 
Defaults to 4", required=false) - public int alleleFreq = 4; - - @Argument(fullName="useMeanProb", doc="Use the mean probability as the \"average quality\" rather than median Q-score") - boolean useMean = false; - - @Argument(fullName="minimumMappingQuality", shortName="mmq", doc="Only use reads above this mapping quality in the power calculation", required=false) - int minMappingQuality = -1; - - @Argument(fullName="ignoreForwardReads",doc="Ignore the forward reads at a site. Defaults to false.", required = false) - boolean ignoreForwardReads = false; - - @Argument(fullName="ignoreReverseReads",doc="Ignore the reverse reads at a site. Defaults to false.", required = false) - boolean ignoreReverseReads = false; - - private boolean calledByAnotherWalker = false; - - public void initialize() { - if ( alleleFreq < 1 ) { - String err = "Allele frequency (-af) must be greater than or equal to one."; - throw new StingException(err); - } - - if ( numIndividuals == 0 ) { - calledByAnotherWalker = true; - } - } - - public Integer reduceInit() { - out.print(makeHeader()); - return 0; - } - - public Integer reduce(Integer mapint, Integer prevint) { - return 0; - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - String output = String.format("%s", context.getLocation().toString()); - - // threshold reads if necessary - if ( ignoreForwardReads && ignoreReverseReads) { - throw new StingException("User has elected to ignore both forward and reverse reads. Power is zero."); - } - else if ( ! ignoreForwardReads && ignoreReverseReads ) { - org.broadinstitute.sting.playground.gatk.walkers.poolseq.ReadOffsetQuad rq = PoolUtils.splitReadsByReadDirection(context.getReads(),context.getOffsets()); - context = new AlignmentContext(context.getLocation(),rq.getFirstReads(),rq.getFirstOffsets()); - } else if ( ignoreForwardReads && ! 
ignoreReverseReads ) { - org.broadinstitute.sting.playground.gatk.walkers.poolseq.ReadOffsetQuad rq = PoolUtils.splitReadsByReadDirection(context.getReads(),context.getOffsets()); - context = new AlignmentContext(context.getLocation(),rq.getSecondReads(),rq.getSecondOffsets()); - } - - if ( minQ > 0 ) { - Pair, List> thresh = PoolUtils.thresholdReadsByQuality(context.getReads(),context.getOffsets(),minQ); - context = new AlignmentContext(context.getLocation(), thresh.getFirst(), thresh.getSecond()); - } - - if ( minMappingQuality > -1 ) { - Pair,List> goodMaps = PoolUtils.thresholdReadsByMappingQuality(context.getReads(),context.getOffsets(),minMappingQuality); - context = new AlignmentContext(context.getLocation(), goodMaps.getFirst(), goodMaps.getSecond()); - } - - // calculate powers and put into output string - - for ( int i = 1; i <= alleleFreq; i ++ ) { - output = String.format("%s\t%f",output,calculatePowerAtFrequency(context,i)); - } - - // print the output string - - out.printf("%s%n",output); - - return 0; - } - - public double calculatePowerAtFrequency( AlignmentContext context, int alleles ) { - return theoreticalPower( context.size(), getMeanQ(context), alleles, lodThresh ); - } - - public byte getMeanQ( AlignmentContext context ) { - byte meanQ; - if ( useMean ) { - meanQ = QualityUtils.probToQual(expectedMatchRate(context)); - } else { - meanQ = MathUtils.getQScoreMedian(context.getReads(),context.getOffsets()); - } - - return meanQ; - } - - public double expectedMatchRate(AlignmentContext context) { - int nReads = context.size(); - double matches = 0.0; - for ( int r = 0; r < nReads; r ++ ) { - matches += QualityUtils.qualToProb(context.getReads().get(r).getBaseQualities()[context.getOffsets().get(r)]); - } - - return matches/nReads; - } - - public String makeHeader() { - // create the header - String header = "chrm:pos"; - for ( int i = 1; i <= alleleFreq; i ++ ) { - header = header + "\tPower_at_"+Integer.toString(i); - } - - return 
String.format("%s%n", header); - } - - public double theoreticalPower( int depth, byte q, int alleles, double lodThreshold ) { - double power; - if( depth <= 0 ) { - power = 0.0; - } else { - double p_error = QualityUtils.qualToErrorProb(q); - double snpProp = getSNPProportion(alleles); - double kterm_num = Math.log10( snpProp * (1 - p_error) + (1 - snpProp) * (p_error/3) ); - double kterm_denom = Math.log10( p_error/3 ); - double dkterm_num = Math.log10( snpProp * (p_error/3) + (1 - snpProp) * (1 - p_error) ); - double dkterm_denom = Math.log10( 1 - p_error); - - int kaccept = (int) Math.ceil( ( lodThreshold - ( (double) depth ) * ( dkterm_num - dkterm_denom ) ) / - ( kterm_num - kterm_denom- dkterm_num + dkterm_denom ) ); - System.out.println("Error="+p_error+" snpProp="+snpProp+" alleles="+alleles+" lodThreshold="+lodThreshold+" kaccept="+kaccept); - - if (kaccept <= 0) { - power = 0.0; - } else { - // we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs - // we can optimize this by checking to see which sum is smaller - if ( depth - kaccept < kaccept ) {// kaccept > depth/2 - calculate power as P[hits between kaccept and depth] - power = MathUtils.binomialCumulativeProbability(kaccept, depth, depth, snpProp); - } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and kaccept] - power = 1-MathUtils.binomialCumulativeProbability(0, kaccept, depth, snpProp); - } - } - } - return power; - } - - private double getSNPProportion(int alleles) { - return ((double)alleles)/(2*numIndividuals); - } - - public void setPoolSize(int poolSize) { - if ( calledByAnotherWalker ) { - numIndividuals = poolSize; - } else { - throw new StingException("This method should only be accessible by calling it from another walker."); - } - } -} diff --git a/archive/java/src/org/broadinstitute/sting/poolseq/Quad.java b/archive/java/src/org/broadinstitute/sting/poolseq/Quad.java deleted file mode 100644 index 
d258c18c5..000000000 --- a/archive/java/src/org/broadinstitute/sting/poolseq/Quad.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.poolseq; - -import org.broadinstitute.sting.utils.collections.Pair; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Sep 11, 2009 - * Time: 5:12:29 PM - * To change this template use File | Settings | File Templates. 
- */ -public class Quad { - public W first; - public X second; - public Y third; - public Z fourth; - - public Quad() { - first = null; - second = null; - third = null; - fourth = null; - } - - public Quad(W w, X x, Y y, Z z) { - first = w; - second = x; - third = y; - fourth = z; - } - - public Quad(Pair a, Pair b) { - first = a.getFirst(); - second = a.getSecond(); - third = b.getFirst(); - fourth = b.getSecond(); - } - - public boolean equals(Object o) { - if(o == null) { - return false; - } else if (! (o instanceof Quad) ) { - return false; - } - - Quad other = (Quad) o; - - return ( equalToNotNull(this.first,other.first) && equalToNotNull(this.second,other.second) - && equalToNotNull(this.third,other.third) && equalToNotNull(this.fourth,other.fourth)); - } - - public int hashCode() { - return getHash(first) ^ getHash(second) ^ getHash(third) ^ getHash(fourth); - } - - public String toString() { - return String.format("(%s, %s, %s, %s)", first.toString(), second.toString(), - third.toString(), fourth.toString()); - } - - public W getFirst() { return first; } - public X getSecond() { return second; } - public Y getThird() { return third; } - public Z getFourth() { return fourth; } - - public Pair getFirstPair() { return new Pair(first,second); } - public Pair getSecondPair() { return new Pair(third,fourth); } - - public void setFirst(Object o) { first = (W) o; } - public void setSecond(Object o) { second = (X) o; } - public void setThird(Object o) { third = (Y) o; } - public void setFourth(Object o) { fourth = (Z) o; } - - private int getHash(Object o) { - int hash = 0; - if(o != null) { - hash = o.hashCode(); - } - return hash; - } - - private boolean equalToNotNull(Object a, Object b) { - boolean areEqual = false; - if ( a != null && b != null ) { - areEqual = a.equals(b); - } else if (a == null && b == null ) { - areEqual = true; - // todo -- make sure we don't want to check for instanceOf here... 
- // todo -- maybe this statement should be eliminated - } - - return areEqual; - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/poolseq/ReadOffsetQuad.java b/archive/java/src/org/broadinstitute/sting/poolseq/ReadOffsetQuad.java deleted file mode 100644 index 87820f82b..000000000 --- a/archive/java/src/org/broadinstitute/sting/poolseq/ReadOffsetQuad.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.poolseq; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import net.sf.samtools.SAMRecord; -import java.util.List; -import java.util.ArrayList; - -/** - * Created by IntelliJ IDEA. 
- * User: chartl - * Date: Sep 12, 2009 - * Time: 1:21:38 AM - * To change this template use File | Settings | File Templates. - */ -public class ReadOffsetQuad extends Quad,List,List, List> { - /* - * ReadOffsetQuad separates the user from specifying the types of objects required to - * store two sets of read/offset pairs in a quad. - * - * Implements methods that return read/offset pairs as AlignmentContexts - * and allows ReadOffsetQuad to be constructed from two AlignmentContexts - * - */ - - // constructor that IntelliJ wants - public ReadOffsetQuad(List a, List b, List c, List d) { - super(a,b,c,d); - } - - // another constructor that IntelliJ wants - public ReadOffsetQuad(Pair,List> a, Pair,List> b) { - super(a,b); - } - - public ReadOffsetQuad(AlignmentContext a, AlignmentContext b) { - first = a.getReads(); - second = a.getOffsets(); - third = b.getReads(); - fourth = b.getOffsets(); - } - - public int numReads() { - return first.size() + third.size(); - } - - public int numReadsFirst() { - return first.size(); - } - - public int numReadsSecond() { - return third.size(); - } - - public List getFirstReads() { - return this.first; - } - - public List getSecondReads() { - return this.third; - } - - public List getReadsCombined() { - ArrayList combined = new ArrayList(first); - combined.addAll(third); - return combined; - } - - public List getOffsetsCombined() { - ArrayList combined = new ArrayList(second); - combined.addAll(fourth); - return combined; - } - - public List getFirstOffsets() { - return this.second; - } - - public List getSecondOffsets() { - return this.fourth; - } - - public Pair,List> getFirstReadOffsetPair() { - return new Pair,List>(first, second); - } - - public Pair,List> getSecondReadOffsetPair() { - return new Pair,List>(third,fourth); - } - - public AlignmentContext getFirstPairAsAlignmentContext(GenomeLoc loc) { - return new AlignmentContext(loc, first, second); - } - - public AlignmentContext getSecondPairAsAlignmentContext(GenomeLoc 
loc) { - return new AlignmentContext(loc, third, fourth); - } - -} diff --git a/archive/java/src/org/broadinstitute/sting/poolseq/SQuad.java b/archive/java/src/org/broadinstitute/sting/poolseq/SQuad.java deleted file mode 100644 index 6dc0183a0..000000000 --- a/archive/java/src/org/broadinstitute/sting/poolseq/SQuad.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.poolseq; - -/** - * Created by IntelliJ IDEA. - * User: Ghost - * Date: Sep 12, 2009 - * Time: 1:04:40 PM - * To change this template use File | Settings | File Templates. - */ -public class SQuad extends Quad { - /* SQuad - single object type Quad - * or "Simple Quad". Makes it less code-intensive - * for user to use quads to hold objects. 
- */ - - public SQuad(X a, X b, X c, X d) { super(a,b,c,d); } - // don't know why this method even neds to be written - // but IntelliJ wants it to be there - -} diff --git a/archive/java/src/org/broadinstitute/sting/secondaryBases/SecondaryBaseTransitionTableWalker.java b/archive/java/src/org/broadinstitute/sting/secondaryBases/SecondaryBaseTransitionTableWalker.java deleted file mode 100755 index 1bf73e418..000000000 --- a/archive/java/src/org/broadinstitute/sting/secondaryBases/SecondaryBaseTransitionTableWalker.java +++ /dev/null @@ -1,130 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.secondaryBases; - -import org.broad.tribble.util.variantcontext.Genotype; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; -import org.broadinstitute.sting.playground.utils.NamedTable; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.commandline.Output; - -import java.util.HashMap; -import java.io.PrintStream; - -/** - * Given a secondary base annotated .bam file and a reference, this walker generates a table of secondary base counts - * for all called loci in the .bam. Each base call made is an instance included in the table. Specifically, the walker - * maps the following vector to a count of secondary bases: - * . 
- */ - -@Reference(window=@Window(start=-1,stop=1)) -public class SecondaryBaseTransitionTableWalker extends LocusWalker { - @Output - PrintStream out; - - HashMap counts = new HashMap(); - private UnifiedGenotyperEngine ug; - private NamedTable altTable; - - public void initialize() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.STANDARD_CONFIDENCE_FOR_CALLING = uac.STANDARD_CONFIDENCE_FOR_EMITTING = 50.0; - uac.ALL_BASES_MODE = true; - ug = new UnifiedGenotyperEngine(getToolkit(), uac); - - altTable = new NamedTable(); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - char refBase = Character.toUpperCase(ref.getBaseAsChar()); - ReadBackedPileup pileup = context.getBasePileup(); - int[] baseCounts = pileup.getBaseCounts(); - int length = 0; - for (int i : baseCounts) {length += i;} - byte[] contextBases = ref.getBases(); - byte prevBase = (byte)Character.toUpperCase(contextBases[0]); - byte nextBase = (byte)Character.toUpperCase(contextBases[contextBases.length - 1]); - - if (contextBases.length == 3 && refBase != 'N' && pileup.getBases() != null && pileup.getSecondaryBases() != null) { - VariantCallContext ugResult = ug.runGenotyper(tracker,ref,context); - if (ugResult != null && ugResult.vc != null) { - Genotype res = ugResult.vc.getGenotype(0); - String call = res.getGenotypeString(); - String type; - String alleleBalance = "N/A"; - if (res.isHomRef()) { - type = "homref"; - } - else if (!res.isHet()) {type = "homvar";} - else if (call.contains(Character.toString(refBase))) { - type = "het"; - char alt; - if (call.charAt(0) == refBase) {alt = call.charAt(1);} - else {alt = call.charAt(0);} - double refCount = baseCounts[BaseUtils.simpleBaseToBaseIndex(refBase)]; - double altCount = baseCounts[BaseUtils.simpleBaseToBaseIndex(alt)]; - alleleBalance = Double.toString(Math.round(100.0*refCount/(refCount + altCount))/100.0); - } - else {type = "bad";} - if (!type.equals("bad")) { - 
for (PileupElement element : pileup) { - char primaryBase = Character.toUpperCase((char)element.getBase()); - char secondaryBase = Character.toUpperCase((char)element.getSecondBase()); - String RG = element.getRead().getReadGroup().getReadGroupId(); - if (secondaryBase != 'N' && secondaryBase != '.' && primaryBase != 'N') { - String strandRef; - String strandPrimary; - String strandPrev; - String strandSecondary; - String strandCall; - if (!element.getRead().getReadNegativeStrandFlag()) { - strandRef = Character.toString(refBase); - strandPrimary = Character.toString(primaryBase); - strandPrev = Character.toString((char)prevBase); - strandSecondary = Character.toString(secondaryBase); - strandCall = call; - } - else { - strandRef = Character.toString(BaseUtils.simpleComplement(refBase)); - strandPrimary = Character.toString(BaseUtils.simpleComplement(primaryBase)); - strandPrev = Character.toString(BaseUtils.simpleComplement((char)nextBase)); - strandSecondary = Character.toString(BaseUtils.simpleComplement(secondaryBase)); - strandCall = BaseUtils.simpleReverseComplement(call); - } - if (strandPrev.charAt(0) != 'N') { - String key = type+' '+RG+' '+strandCall+' '+alleleBalance+' '+strandRef+' '+strandPrimary+' '+strandPrev+' '+strandSecondary; - if (counts.containsKey(key)) { - counts.put(key, counts.get(key) + Long.valueOf(1)); - } - else { - counts.put(key, Long.valueOf(1)); - } - } - } - } - } - } - } - return 1; - } - - public Integer reduceInit() {return 0;} - - public Integer reduce(Integer value, Integer sum) {return sum + value;} - - public void onTraversalDone(Integer result) { - out.println(">>>"); - out.println("Type ReadGroup CalledGenotype AlleleBalance ReferenceBase PrimaryBase PreviousBase SecondaryBase Count"); - for (String key : counts.keySet()) { - out.println(key + ' ' + counts.get(key).toString()); - } - out.println("Processed " + result.toString() + " loci."); - } -} \ No newline at end of file diff --git 
a/archive/java/src/org/broadinstitute/sting/vcftools/GenotypeConcordance.java b/archive/java/src/org/broadinstitute/sting/vcftools/GenotypeConcordance.java deleted file mode 100644 index 536081ad1..000000000 --- a/archive/java/src/org/broadinstitute/sting/vcftools/GenotypeConcordance.java +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.tools.vcf; - -class GenotypeConcordance - { - String name; - - protected int[][] counts = {{0,0,0}, - {0,0,0}, - {0,0,0}}; - - public GenotypeConcordance(String name) - { - this.name = name; - } - - public void add(char ref, String g1, String g2) - { - int g1_dosage = 0; - int g2_dosage = 0; - - if (g1.charAt(0) != ref) { g1_dosage += 1; } - if (g1.charAt(1) != ref) { g1_dosage += 1; } - - if (g2.charAt(0) != ref) { g2_dosage += 1; } - if (g2.charAt(1) != ref) { g2_dosage += 1; } - - counts[g1_dosage][g2_dosage] += 1; - } - - public void add(GenotypeConcordance G) - { - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - counts[i][j] += G.counts[i][j]; - } - } - } - - public String toLine() - { - int on_diag = 0; - int on_diag_not_homref = 0; - int off_diag = 0; - int total = 0; - int total_not_homref = 0; - - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - if (i == j) { on_diag += counts[i][j]; } - if (i == j && i != 0) { on_diag_not_homref += counts[i][j]; } - if (i != j) { off_diag += counts[i][j]; } - if (i != 0 || j != 0) { total_not_homref += counts[i][j]; } - total += counts[i][j]; - } - } - - String s = String.format("SNP %s %d %d %f %f\n", this.name, total, total_not_homref, this.errorRate(), this.hetErrorRate()); - return s; - } - - public String toString() - { - String s = this.name + "\n"; - - int on_diag = 0; - int on_diag_not_homref = 0; - int off_diag = 0; - int total = 0; - int total_not_homref = 0; - - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - s += counts[i][j] + "\t"; - - if (i == j) { on_diag += counts[i][j]; } - if (i == j && i != 0) { on_diag_not_homref += counts[i][j]; } - if (i != j) { off_diag += counts[i][j]; } - if (i != 0 || j != 0) { total_not_homref += counts[i][j]; } - total += counts[i][j]; - } - s += "\n"; - } - - //s += String.format("On-Diagonal = %.02f\n", 100.0 * (double)on_diag / (double)total); - //s += 
String.format("On-Diagonal (not hom-ref) = %.02f\n", 100.0 * (double)on_diag_not_homref / (double)total_not_homref); - //s += String.format("Off-Diagonal = %.02f\n", 100.0 * (double)off_diag / (double)total_not_homref); - s += String.format("Total = %d\n", total); - s += String.format("Total (not hom-ref) = %d\n", total_not_homref); - s += String.format("Error Rate = %f\n", this.errorRate()); - - s += "\n"; - return s; - } - - public double errorRate() - { - int off_diag = 0; - int total_not_homref = 0; - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - if (i != j) { off_diag += counts[i][j]; } - if (i != 0 || j != 0) { total_not_homref += counts[i][j]; } - } - } - double error_rate = (double)off_diag / (double)total_not_homref; - return error_rate; - } - - public double hetErrorRate() - { - int true_hets = 0; - int correct_hets = 0; - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - if (j == 1) { true_hets += counts[i][j]; } - } - } - correct_hets = counts[1][1]; - double het_error_rate = 1.0 - ((double)correct_hets / (double)true_hets); - return het_error_rate; - } - - - public int total() - { - int total = 0; - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - total += counts[i][j]; - } - } - return total; - } - - public int totalNonHomRef() - { - int total = 0; - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - if (i != 0 || j != 0) { total += counts[i][j]; } - } - } - return total; - } - - } diff --git a/archive/java/src/org/broadinstitute/sting/vcftools/VCFApplyCuts.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFApplyCuts.java deleted file mode 100644 index 8c1bc4c1e..000000000 --- a/archive/java/src/org/broadinstitute/sting/vcftools/VCFApplyCuts.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files 
(the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.tools.vcf; - -//import org.broadinstitute.sting.playground.tools.vcf.VCFOptimize.Cut; - -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFRecord; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Argument; - -import org.broadinstitute.sting.utils.genotype.vcf.*; - - -import java.io.*; -import java.util.*; - -//import org.apache.commons.math.optimization.*; -//import org.apache.commons.math.optimization.direct.*; -//import org.apache.commons.math.analysis.MultivariateRealFunction; - -// First draft of a program for working with VCF files in various ways. 
- - -/** - * @author jmaguire - */ - - -class VCFApplyCuts extends CommandLineProgram -{ - @Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "cuts", shortName = "cuts", doc = "file to read cuts from", required = true) public String cuts_filename; - @Argument(fullName = "output", shortName = "output", doc = "file to write filtered VCF to", required = true) public String output_filename; - - class Cut - { - public double lod; - public double slod; - public int freq; - - public Cut(double lod, double slod) - { - this.lod = lod; - this.slod = slod; - this.freq = -1; - } - - public Cut(double lod, double slod, int freq) - { - this.lod = lod; - this.slod = slod; - this.freq = freq; - } - - public Cut(String record) - { - String[] tokens = record.split("\\s+"); - this.freq = Integer.parseInt(tokens[0]); - this.lod = Double.parseDouble(tokens[1]); - this.slod = Double.parseDouble(tokens[2]); - } - - public String toString() - { - return String.format("%d %f %f", freq, lod, slod); - } - } - - private boolean applyCuts(ArrayList cuts, VCFHeader header, VCFRecord record) - { - Map info = record.getInfoValues(); - - if (! info.containsKey("AC")) - { - throw new RuntimeException("AC not present in record: \n" + record.toStringEncoding(header)); - } - if (! info.containsKey("DP")) - { - throw new RuntimeException("DP not present in record: \n" + record.toStringEncoding(header)); - } - if (! 
info.containsKey("SB")) - { - throw new RuntimeException("SB not present in record: \n" + record.toStringEncoding(header)); - } - - boolean transition = VCFTool.isTransition(record); - int freq = Integer.parseInt(record.getInfoValues().get("AC")); - double LOD = record.getQual(); - double depth = Double.parseDouble(record.getInfoValues().get("DP")); - double SLOD = Double.parseDouble(record.getInfoValues().get("SB")); - - for (int i = 0; i < cuts.size(); i++) - { - Cut cut = cuts.get(i); - if (cut.freq == freq) - { - if ((LOD >= cut.lod) && (-1*SLOD >= cut.slod)) { return true; } - } - } - - return false; - } - - - @Override - protected int execute() - { - // load cuts. - - ArrayList cuts = null; - Scanner cuts_file = null; - try - { - cuts = new ArrayList(); - cuts_file = new Scanner(new File(cuts_filename)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - while (cuts_file.hasNextLine()) - { - String line = cuts_file.nextLine(); - Cut cut = new Cut(line); - cuts.add(cut); - } - - - VCFReader reader = null; - - if (autocorrect) { reader = new VCFReader(new File(filename),new VCFHomogenizer()); } - else { reader = new VCFReader(new File(filename)); } - - VCFHeader header = reader.getHeader(); - - - VCFWriter writer = new VCFWriter(new File(output_filename)); - writer.writeHeader(header); - - - Date start_time = new Date(); - int n_records_processed = 0; - int n_records_passed = 0; - while(reader.hasNext()) - { - VCFRecord record = reader.next(); - - if (applyCuts(cuts, header, record) == true) - { - writer.addRecord(record); - n_records_passed += 1; - } - - n_records_processed += 1; - } - System.out.printf("Processed %d records\n", n_records_processed); - System.out.printf("Passed %d records\n", n_records_passed); - - writer.close(); - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/vcftools/VCFCallRates.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFCallRates.java deleted file mode 100644 index 
6e5d441c1..000000000 --- a/archive/java/src/org/broadinstitute/sting/vcftools/VCFCallRates.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.tools.vcf; -import org.broad.tribble.vcf.VCFGenotypeEncoding; -import org.broad.tribble.vcf.VCFGenotypeRecord; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFRecord; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.*; -import java.util.*; - -import net.sf.picard.util.Interval; -import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; - - -class VCFCallRates extends CommandLineProgram -{ - @Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename; - @Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false; - @Argument(fullName = "min_call_rate", shortName = "min_call_rate", doc = "what fraction of samples must have a call", required = false) public double min_call_rate = 0.9; - - @Override - protected int execute() - { - //System.out.println("Loading " + filename + "..."); - - PrintStream output = null; - try - { - output = new PrintStream(new FileOutputStream(output_filename)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - VCFReader reader; - - if (autocorrect) - { - reader = new VCFReader(new File(filename),new VCFHomogenizer()); - } - else - { - reader = new VCFReader(new File(filename)); - } - - VCFHeader header = reader.getHeader(); - VCFRecord record = reader.next(); - - String[] sample_names = record.getSampleNames(); - int[] individual_counts = new int[sample_names.length]; - int[] individual_drops = new int[sample_names.length]; 
- - while(true) - { - if (record == null) { break; } - - Interval interval = VCFTool.getIntervalFromRecord(record); - - // (unless it is "filtered") - if (record.isFiltered()) - { - record = reader.next(); - } - - char ref = record.getReference().charAt(0); - - String[] new_sample_names = record.getSampleNames(); - if (new_sample_names.length != sample_names.length) { throw new RuntimeException(); } - for (int i = 0; i < new_sample_names.length; i++) { if (! sample_names[i].equals(new_sample_names[i])) { throw new RuntimeException(); } } - - List genotypes = record.getVCFGenotypeRecords(); - - long n_ref = 0; - long n_alt = 0; - long n_total = 0; - long n_calls = 0; - long n_dropped = 0; - - for (int i = 0; i < sample_names.length; i++) - { - VCFGenotypeRecord rec = genotypes.get(i); - - Long gq; - - if (rec.getFields().get("GQ") != null) - { - Double gq_double = Double.parseDouble(rec.getFields().get("GQ")); - gq = gq_double.longValue(); - } - else - { - gq = 0L; - } - - List alleles = rec.getAlleles(); - - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - n_total += 1; - - individual_counts[i] += 1; - if (g.equals("..")) - { - n_dropped += 1; - individual_drops[i] += 1; - continue; - } - n_calls += 1; - if (g.charAt(0) == ref) { n_ref += 1; } else { n_alt += 1; } - if (g.charAt(1) == ref) { n_ref += 1; } else { n_alt += 1; } - } - - output.printf("SNP %s %d %d %f\n", interval, n_total, n_dropped, (double)n_dropped / (double)n_total); - - record = reader.next(); - } - - - // Now output the statistics. 
- - for (int i = 0; i < sample_names.length; i++) - { - int n_total = individual_counts[i]; - int n_dropped = individual_drops[i]; - output.printf("INDIVIDUAL %s %d %d %f\n", sample_names[i], n_total, n_dropped, (double)n_dropped / (double)n_total); - } - - output.flush(); - output.close(); - - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/vcftools/VCFHomogenizer.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFHomogenizer.java deleted file mode 100644 index 9e8de6d28..000000000 --- a/archive/java/src/org/broadinstitute/sting/vcftools/VCFHomogenizer.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.tools.vcf; - -import org.broad.tribble.vcf.VCFCodec; - -import java.io.*; -import java.util.zip.*; - -// Edit a VCF on the fly to be on-spec. 
- -/** - * @author jmaguire - */ - -class VCFHomogenizer implements VCFCodec.LineTransform { - - //my ($chr, $off, $id, $ref, $alt, $qual, $filter, $info, $format, @genotypes) = @tokens; - public String lineTransform(String input) - { - if (input == null) { return null; } - - //System.out.println("input : " + input); - - // Make it tab-delimited - input = input.replaceAll(" +", "\t"); - - ///////// - // Header corrections - if (input.startsWith("##format=VCFv3.2")) { return "##format=VCRv3.2\n"; } - if (input.startsWith("#CHROM")) { return input.replaceAll("PROB", "QUAL"); } - if (input.startsWith("#")) { return input; } - - ///////// - // Line-level corrections - - // make "nan" into "NaN" - input = input.replaceAll("nan", "NaN"); - input = input.replaceAll("DB(\\;|\\s)", "DB=1$1"); - input = input.replaceAll("HM2(\\;|\\s)", "HM2=1$1"); - input = input.replaceAll("HM3(\\;|\\s)", "HM3=1$1"); - - String[] tokens = input.split("\\s+"); - - ///////// - // Token-level corrections - - // if alt is "N", make it "." - if (tokens[4].equals("N")) { tokens[4] = "."; } - if (tokens[5].equals(".")) { tokens[5] = "-1"; } - - String ref = tokens[3]; - String alt = tokens[4]; - String[] alts = alt.split(","); - - for (int i = 9; i < tokens.length; i++) - { - if (tokens[i].equals(".")) { tokens[i] = "./.:0"; } - - tokens[i] = tokens[i].replaceAll(ref, "0"); - if (! alt.equals(".")) - { - if (alts.length == 1) - { - tokens[i] = tokens[i].replaceAll(alt, "1"); - } - else - { - for (int j = 0; j < alts.length; j++) - { - tokens[i] = tokens[i].replaceAll(alts[j], "1"); - } - } - } - } - - ///////// - // Info-level corrections - - String info = tokens[7]; - String new_info = ""; - String[] info_tokens = info.split(";"); - for (int i = 0; i < info_tokens.length; i++) - { - if (info_tokens[i].startsWith("R2=")) - { - // Fix NaN's in RNaN's in R2. 
- String new_token = info_tokens[i].replace("NaN", "0.0"); - info_tokens[i] = new_token; - } - else if (info_tokens[i].startsWith("AC=")) - { - // Fix the case where AC includes the ref count first. - String[] ACs = info_tokens[i].replaceAll("^AC=", "").split(","); - if (ACs.length == alts.length+1) - { - String new_ACs = ""; - for (int j = 1; j < ACs.length; j++) - { - new_ACs += ACs[j]; - if (j != (ACs.length-1)) { new_ACs += ","; } - } - info_tokens[i] = "AC=" + new_ACs; - continue; - } - } - - new_info += info_tokens[i]; - if (i != (info_tokens.length-1)) { new_info += ";"; } - } - tokens[7] = new_info; - - - ///////// - // Now put it back together and emit. - String output = tokens[0]; - for (int i = 1; i < tokens.length; i++) - { - output = output + "\t" + tokens[i]; - } - output = output + "\n"; - - //System.out.println("output: " + output); - - return output; - } -} - - - diff --git a/archive/java/src/org/broadinstitute/sting/vcftools/VCFMerge.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFMerge.java deleted file mode 100644 index a9a258818..000000000 --- a/archive/java/src/org/broadinstitute/sting/vcftools/VCFMerge.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.tools.vcf; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFRecord; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Argument; - -import org.broadinstitute.sting.utils.genotype.vcf.*; - - -import java.io.*; - -import net.sf.picard.util.Interval; - -//import org.apache.commons.math.optimization.*; -//import org.apache.commons.math.optimization.direct.*; -//import org.apache.commons.math.analysis.MultivariateRealFunction; - -// Program for frequency-specific VCF-files. 
- - -/** - * @author jmaguire - */ - - -class VCFMerge extends CommandLineProgram -{ - - @Argument(fullName = "vcf1", shortName = "vcf1", doc = "file to open", required = true) public String filename1; - @Argument(fullName = "vcf2", shortName = "vcf2", doc = "file to open", required = true) public String filename2; - @Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "verbose", shortName = "verbose", doc = "print way too much debugging output", required = false) public Boolean verbose = false; - - @Override - protected int execute() - { - VCFReader reader1; - VCFReader reader2; - - if (autocorrect) - { - reader1 = new VCFReader(new File(filename1),new VCFHomogenizer()); - reader2 = new VCFReader(new File(filename2),new VCFHomogenizer()); - } - else - { - reader1 = new VCFReader(new File(filename1)); - reader2 = new VCFReader(new File(filename2)); - } - - VCFHeader header1 = reader1.getHeader(); - VCFHeader header2 = reader2.getHeader(); - - VCFRecord record1 = reader1.next(); - VCFRecord record2 = reader2.next(); - - VCFWriter writer = new VCFWriter(new File(output_filename)); - writer.writeHeader(header1); - - while(true) - { - if ((record1 == null) && (record2 == null)) { break; } - else if (record1 == null) { writer.addRecord(record2); record2 = reader2.next(); continue; } - else if (record2 == null) { writer.addRecord(record1); record1 = reader1.next(); continue; } - - if (verbose) - { - System.out.printf("RECORD1: %s\n", record1.toStringEncoding(header1)); - System.out.printf("RECORD2: %s\n", record2.toStringEncoding(header2)); - } - - if (record1.isFiltered()) { record1 = reader1.next(); continue; } - if (record2.isFiltered()) { record2 = reader2.next(); continue; } - - Interval interval1 = 
VCFTool.getIntervalFromRecord(record1); - Interval interval2 = VCFTool.getIntervalFromRecord(record2); - - int comparison = VCFTool.compareIntervals(interval1, interval2); - - if (comparison == 0) - { - // records match! Emit one. - writer.addRecord(record1); - record1 = reader1.next(); - record2 = reader2.next(); - } - else if (comparison > 0) - { - writer.addRecord(record2); - record2 = reader2.next(); - } - else if (comparison < 0) - { - writer.addRecord(record1); - record1 = reader1.next(); - } - } - - writer.close(); - - return 0; - } -} - - diff --git a/archive/java/src/org/broadinstitute/sting/vcftools/VCFOptimize.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFOptimize.java deleted file mode 100644 index eeefbb842..000000000 --- a/archive/java/src/org/broadinstitute/sting/vcftools/VCFOptimize.java +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.tools.vcf; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFRecord; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; - - -import java.io.*; -import java.util.*; - -//import org.apache.commons.math.optimization.*; -//import org.apache.commons.math.optimization.direct.*; -//import org.apache.commons.math.analysis.MultivariateRealFunction; - -// Program for frequency-specific VCF-files. - - -/** - * @author jmaguire - */ - - -class VCFOptimize extends CommandLineProgram -{ - @Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "target_TsTv", shortName = "target_TsTv", doc = "Minimum acceptable TsTv", required=false) public double target_TsTv = 2.07; - @Argument(fullName = "output", shortName = "output", doc = "file to write cuts to", required = true) public String output_filename; - @Argument(fullName = "min_calls", shortName = "min_calls", doc = "Minimum signifigant number of calls", required=false) public int min_calls = 100; - @Argument(fullName = "num_breaks", shortName = "num_breaks", doc = "Number of breaks to search over", required=false) public int num_breaks = 100; - - // Debugging arguments: - @Argument(fullName = "n_records", shortName = "n_records", doc = "Number of records to load (debugging)", required=false) public int 
n_records_to_process = Integer.MAX_VALUE; - @Argument(fullName = "verbose", shortName = "verbose", doc = "print detailed debugging info.", required=false) public boolean verbose = false; - - class OptimizationRecord - { - public boolean transition; - public int freq; - public double[] features; - - public OptimizationRecord(boolean transition, int freq, double[] features) - { - this.transition = transition; - this.freq = freq; - this.features = features.clone(); - } - - public OptimizationRecord clone() - { - return new OptimizationRecord(transition, freq, features.clone()); - } - } - - private OptimizationRecord pack(VCFHeader header, VCFRecord input) - { - Map info = input.getInfoValues(); - - if (! info.containsKey("AC")) - { - throw new RuntimeException("AC not present in record: \n" + input.toStringEncoding(header)); - } - if (! info.containsKey("DP")) - { - throw new RuntimeException("DP not present in record: \n" + input.toStringEncoding(header)); - } - if (! info.containsKey("SB")) - { - throw new RuntimeException("SB not present in record: \n" + input.toStringEncoding(header)); - } - - boolean transition = VCFTool.isTransition(input); - int freq = Integer.parseInt(input.getInfoValues().get("AC")); - double LOD = input.getQual(); - double depth = Double.parseDouble(input.getInfoValues().get("DP")); - double SLOD = Double.parseDouble(input.getInfoValues().get("SB")); - - double[] features = new double[2]; - features[0] = LOD; - features[1] = -1*SLOD; - - return new OptimizationRecord(transition, freq, features); - } - - // This is the objective function we're searching in. 
- // if (tstv>=min) { return #snps; } else { return -inf; } - public double tstv(double[] point, OptimizationRecord[] records) - { - double transitions = 0; - double transversions = 0; - double total = 0; - for (int i = 0; i < records.length; i++) - { - int j = 0; - for (j = 0; j < point.length; j++) - { -// if (records == null) { System.out.printf("records==null\n"); } -// if (records[i] == null) { System.out.printf("records[%d]==null\n", i); } -// if (records[i].features == null) { System.out.printf("records[%d].features==null\n", i); } - - if (records[i].features[j] < point[j]) { break; } - } - if (j == point.length) - { - if (records[i].transition == true) { transitions += 1; } - else { transversions += 1; } - total += 1; - } - } - - double tstv = transitions / transversions; - return tstv; - } - - - // This is the objective function we're searching in. - // if (tstv>=min) { return #snps; } else { return -inf; } - public double num_calls(double[] point, OptimizationRecord[] records) - { - double total = 0; - for (int i = 0; i < records.length; i++) - { - int j = 0; - for (j = 0; j < point.length; j++) - { -// if (records == null) { System.out.printf("records==null\n"); } -// if (records[i] == null) { System.out.printf("records[%d]==null\n", i); } -// if (records[i].features == null) { System.out.printf("records[%d].features==null\n", i); } - - if (records[i].features[j] < point[j]) { break; } - } - if (j == point.length) - { - total += 1; - } - } - - return total; - } - - - public class Cut - { - public double lod; - public double slod; - public int freq; - - public Cut(double lod, double slod) - { - this.lod = lod; - this.slod = slod; - this.freq = -1; - } - - public Cut(double lod, double slod, int freq) - { - this.lod = lod; - this.slod = slod; - this.freq = freq; - } - - public Cut(String record) - { - String[] tokens = record.split("\\s+"); - this.freq = Integer.parseInt(tokens[0]); - this.lod = Double.parseDouble(tokens[1]); - this.slod = 
Double.parseDouble(tokens[2]); - } - - public String toString() - { - return String.format("%d %f %f", freq, lod, slod); - } - } - - - // Just a simple grid search. - private Cut optimize(OptimizationRecord[] records, double min_TsTv, int freq) - { - - - double[] lods = new double[records.length]; - double[] slods = new double[records.length]; - for (int i = 0; i < lods.length; i++) - { - lods[i] = records[i].features[0]; - slods[i] = records[i].features[1]; - } - - Arrays.sort(lods); - Arrays.sort(slods); - - double[] lod_breaks = new double[num_breaks]; - double[] slod_breaks = new double[num_breaks]; - int bin_size = 1 + (records.length / num_breaks); - - //System.out.printf("BREAKS i j lod slod\n"); - int j = 0; - for (int i = 0; i < records.length; i += bin_size) - { - lod_breaks[j] = lods[i]; - slod_breaks[j] = slods[i]; - j += 1; - //System.out.printf("BREAKS %d %d %f %f\n", i, j, lods[i], slods[i]); - } - //System.out.printf("\n"); - - double best_lod = lod_breaks[0]; - double best_slod = slod_breaks[0]; - - int best_lod_idx = 0; - int best_slod_idx = 0; - - double[] point = new double[2]; - point[0] = best_lod; - point[1] = best_slod; - - double best_tstv = tstv(point, records); - double best_num_calls = num_calls(point, records); - boolean flag = false; - - //for (double lod = 0; lod < 8000; lod += 10) - for (int lod_idx = 0; lod_idx < num_breaks; lod_idx += 1) - { - double lod = lod_breaks[lod_idx]; - //for (double slod = -4000; slod < 1000; slod += 10) - for (int slod_idx = 0; slod_idx < num_breaks; slod_idx += 1) - { - double slod = slod_breaks[slod_idx]; - - point = new double[2]; - point[0] = lod; - point[1] = slod; - double tstv = tstv(point, records); - double num_calls = num_calls(point, records); - - if (num_calls < min_calls) { continue; } - - if ((tstv >= min_TsTv) && (num_calls > best_num_calls)) - { - best_lod=lod; - best_slod=slod; - best_tstv=tstv; - best_num_calls=num_calls; - best_lod_idx = lod_idx; - best_slod_idx = slod_idx; - 
flag=true; - } - else if ((tstv >= best_tstv) && (!flag)) - { - best_lod=lod; - best_slod=slod; - best_tstv=tstv; - best_num_calls=num_calls; - best_lod_idx = lod_idx; - best_slod_idx = slod_idx; - } - - - if (verbose) - { - System.out.printf("DEBUG: %d | %d %d | %f %f %f %f | %f %f %f %f\n", - freq, - lod_idx, slod_idx, - lod, slod, num_calls, tstv, - best_lod, best_slod, best_num_calls, best_tstv); - } - } - } - - //System.out.printf("Found optimum: lod=%f slod=%f num_calls=%f tstv=%f\n", best_lod, best_slod, best_num_calls, best_tstv); - System.out.printf("%d %d %d %f %f %f %f\n", freq, best_lod_idx, best_slod_idx, best_lod, best_slod, best_num_calls, best_tstv); - - return new Cut(best_lod, best_slod); - } - - @Override - protected int execute() - { - System.out.println("Loading " + filename + "..."); - - VCFReader reader = null; - - if (autocorrect) { reader = new VCFReader(new File(filename),new VCFHomogenizer()); } - else { reader = new VCFReader(new File(filename)); } - - PrintWriter output = null; - try - { - output = new PrintWriter(new FileWriter(output_filename)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - VCFHeader header = reader.getHeader(); - - HashMap> records = new HashMap>(); - - Date start_time = new Date(); - int n_records_processed = 0; - int max_freq = 0; - while(reader.hasNext()) - { - VCFRecord record = reader.next(); - - OptimizationRecord optimization_record = pack(header, record); - - if (optimization_record.freq > max_freq) { max_freq = optimization_record.freq; } - - if (! 
records.containsKey(optimization_record.freq)) { records.put(optimization_record.freq, new ArrayList()); } - records.get(optimization_record.freq).add(optimization_record.clone()); - - n_records_processed += 1; - - if (n_records_processed == n_records_to_process) { break; } - } - System.out.printf("Loaded %d records\n", n_records_processed); - - //for (int freq = 1; freq <= 5; freq += 1) - for (int freq = 1; freq <= max_freq; freq += 1) - { - if (records.get(freq) == null) { System.out.printf("Skipping AAF %d (no calls)\n", freq); continue; } - System.out.printf("\nOptimizing AAF %d...\n", freq); - - OptimizationRecord[] fnord = new OptimizationRecord[records.get(freq).size()]; - Cut cut = optimize(records.get(freq).toArray(fnord), target_TsTv, freq); - cut.freq = freq; - - output.println(cut); - } - output.flush(); - output.close(); - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis.java deleted file mode 100644 index 3e04ac2bb..000000000 --- a/archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis.java +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.tools.vcf; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFRecord; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.*; - -import net.sf.picard.util.Interval; -import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; - - -class VCFSequenomAnalysis extends CommandLineProgram -{ - @Argument(fullName = "sequenom", shortName = "sequenom", doc = "file to open", required = true) public String filename1; - @Argument(fullName = "sequencing", shortName = "sequencing", doc = "file to open", required = true) public String filename2; - @Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = false) public String output_filename = "/dev/stdout"; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = true; - @Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false; - @Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1; - - - @Override - protected int execute() - { - //System.out.println("Loading " + filename + "..."); - - PrintStream output = null; - try - { - output = new 
PrintStream(new FileOutputStream(output_filename)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - output.printf("interval flag ref alt missing_base n_total_sequenom failure_rate_sequenom n_alt_sequencing HWE_sequencing_chi HWE_sequenom_chi HWE_sequencing_p HWE_sequenom_p\n"); - - VCFReader reader1; - VCFReader reader2; - - if (autocorrect) - { - reader1 = new VCFReader(new File(filename1),new VCFHomogenizer()); - reader2 = new VCFReader(new File(filename2),new VCFHomogenizer()); - } - else - { - reader1 = new VCFReader(new File(filename1)); - reader2 = new VCFReader(new File(filename2)); - } - - VCFHeader header1 = reader1.getHeader(); - VCFHeader header2 = reader2.getHeader(); - - VCFRecord record1 = reader1.next(); - VCFRecord record2 = reader2.next(); - - - while(true) - { - if (record1 == null) { break; } - if (record2 == null) { break; } - - String[] sample_names = record2.getSampleNames(); - - Interval interval1 = VCFTool.getIntervalFromRecord(record1); - Interval interval2 = VCFTool.getIntervalFromRecord(record2); - - int comparison = interval1.compareTo(interval2); - - if (comparison == 0) - { - // records match! compute concordance. - - // (unless one of them is "filtered") - if (record1.isFiltered() || record2.isFiltered()) - { - record1 = reader1.next(); - record2 = reader2.next(); - continue; - } - - char ref = record1.getReference().charAt(0); - char alt = VCFTool.getAlt(record2); - - int n_total_sequenom = VCFTool.Compute_n_total(record1); - double failure_rate_sequenom = VCFTool.Compute_failure_rate(record1); - - int n_alt_sequenom = VCFTool.Compute_n_alt(record1); - int n_alt_sequencing = VCFTool.Compute_n_alt(record2); - - double HWE_sequenom = VCFTool.Compute_HWE(record1, sample_names); - double HWE_sequencing = VCFTool.Compute_HWE(record2); - - boolean isPolymorphic_sequenom = (n_alt_sequenom > 0) ? true : false; - boolean isPolymorphic_sequencing = (n_alt_sequencing > 0) ? 
true : false; - - String flag = null; - char missing_base = '.'; - - if (isPolymorphic_sequenom) - { - flag = "TP"; - if (n_alt_sequenom == n_total_sequenom) { missing_base = ref; } - } - else - { - flag = "FP"; - missing_base = alt; - } - - output.printf("%s %s %c %c %c %d %f %d %f %f %f %f\n", - interval1, - flag, - ref, - alt, - missing_base, - n_total_sequenom, - failure_rate_sequenom, - n_alt_sequencing, - HWE_sequencing, - HWE_sequenom, - VCFTool.P_from_Chi(HWE_sequencing), - VCFTool.P_from_Chi(HWE_sequenom)); - - record1 = reader1.next(); - record2 = reader2.next(); - } - else if (comparison > 0) - { - // interval1 is later than interval2. - //System.err.printf("Skipping (2): %s\n", VCFTool.getIntervalFromRecord(record2)); - record2 = reader2.next(); - } - else if (comparison < 0) - { - // interval2 is later than interval1. - //System.err.printf("Skipping (1): %s\n", VCFTool.getIntervalFromRecord(record1)); - record1 = reader1.next(); - } - - } - - output.flush(); - output.close(); - - - return 0; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis2.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis2.java deleted file mode 100644 index 262e9a42a..000000000 --- a/archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis2.java +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial 
portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.tools.vcf; -import org.broad.tribble.vcf.VCFGenotypeEncoding; -import org.broad.tribble.vcf.VCFGenotypeRecord; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFRecord; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.*; -import java.util.*; -import java.lang.*; - -import net.sf.picard.util.Interval; -import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; - - -class VCFSequenomAnalysis2 extends CommandLineProgram -{ - @Argument(fullName = "sequenom", shortName = "sequenom", doc = "file to open", required = true) public String filename1; - @Argument(fullName = "sequencing", shortName = "sequencing", doc = "file to open", required = true) public String filename2; - @Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = false) public String output_filename = "/dev/stdout"; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = true; - @Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false; - @Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1; 
- - - @Override - protected int execute() - { - //System.out.println("Loading " + filename + "..."); - - PrintStream output = null; - try - { - output = new PrintStream(new FileOutputStream(output_filename)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - output.printf("PROBE interval flag ref alt n_total_sequenom failure_rate_sequenom n_alt_sequencing n_alt_sequenom p_alt_sequencing p_alt_sequenom HWE_sequencing_chi HWE_sequenom_chi HWE_sequencing_p HWE_sequenom_p is_singleton_in_sequencing singleton_matched_in_sequenom n_sequencing_hets n_sequenom_hets num_hets_also_het_in_sequenom num_hets_dropped_in_sequenom\n"); - - VCFReader reader1; - VCFReader reader2; - - if (autocorrect) - { - reader1 = new VCFReader(new File(filename1),new VCFHomogenizer()); - reader2 = new VCFReader(new File(filename2),new VCFHomogenizer()); - } - else - { - reader1 = new VCFReader(new File(filename1)); - reader2 = new VCFReader(new File(filename2)); - } - - VCFHeader header1 = reader1.getHeader(); - VCFHeader header2 = reader2.getHeader(); - - VCFRecord record1 = reader1.next(); - VCFRecord record2 = reader2.next(); - - int[] sequenom_aaf_counts = new int[1024]; - int[] sequencing_aaf_counts = new int[1024]; - int max_aaf = 0; - - while(true) - { - if (record1 == null) { break; } - if (record2 == null) { break; } - - Interval interval1 = VCFTool.getIntervalFromRecord(record1); - Interval interval2 = VCFTool.getIntervalFromRecord(record2); - - int comparison = interval1.compareTo(interval2); - - if (comparison == 0) - { - // records match! compute concordance. 
- - // (unless one of them is "filtered") - if (record1.isFiltered() || record2.isFiltered()) - { - record1 = reader1.next(); - record2 = reader2.next(); - continue; - } - - String[] sample_names = record2.getSampleNames(); - - char ref = record1.getReference().charAt(0); - char alt = VCFTool.getAlt(record2); - - int n_total_sequenom = VCFTool.Compute_n_total(record1, sample_names); - int n_total_sequencing = VCFTool.Compute_n_total(record2, sample_names); - double failure_rate_sequenom = VCFTool.Compute_failure_rate(record1); - - int n_alt_sequenom = VCFTool.Compute_n_alt(record1, sample_names); - int n_alt_sequencing = VCFTool.Compute_n_alt(record2, sample_names); - - double p_alt_sequenom = (double)n_alt_sequenom / (double)n_total_sequenom; - double p_alt_sequencing = (double)n_alt_sequencing / (double)n_total_sequencing; - - int n_het_sequenom = VCFTool.Compute_n_het(record1, sample_names); - int n_het_sequencing = VCFTool.Compute_n_het(record2, sample_names); - - sequenom_aaf_counts[n_alt_sequenom] += 1; - sequencing_aaf_counts[n_alt_sequencing] += 1; - if (n_alt_sequenom > max_aaf) { max_aaf = n_alt_sequenom; } - if (n_alt_sequencing > max_aaf) { max_aaf = n_alt_sequencing; } - - double HWE_sequenom = VCFTool.Compute_HWE(record1, sample_names); - double HWE_sequencing = VCFTool.Compute_HWE(record2, sample_names); - - boolean isPolymorphic_sequenom = (n_alt_sequenom > 0) ? true : false; - boolean isPolymorphic_sequencing = (n_alt_sequencing > 0) ? 
true : false; - - int is_singleton_in_sequencing = 0; - int singleton_matched_in_sequenom = 0; - - if ((n_alt_sequencing == 1) && (VCFTool.Compute_n_alt(record1) > 0)) - { - is_singleton_in_sequencing = 1; - singleton_matched_in_sequenom = CheckSingletonMatch(record2, record1); - } - - int[] het_match_ans = ComputeHetMatches(record2, record1); - int num_hets_also_het_in_sequenom = het_match_ans[0]; - int num_hets_dropped_in_sequenom = het_match_ans[1]; - - String flag = null; - if (isPolymorphic_sequenom) { flag = "TP"; } - else { flag = "FP"; } - - output.printf("PROBE %s %s %c %c %d %f %d %d %f %f %f %f %f %f %d %d %d %d %d %d\n", - interval1, - flag, - ref, - alt, - n_total_sequenom, - failure_rate_sequenom, - n_alt_sequencing, - n_alt_sequenom, - p_alt_sequencing, - p_alt_sequenom, - HWE_sequencing, - HWE_sequenom, - VCFTool.P_from_Chi(HWE_sequencing), - VCFTool.P_from_Chi(HWE_sequenom), - is_singleton_in_sequencing, - singleton_matched_in_sequenom, - n_het_sequencing, - n_het_sequenom, - num_hets_also_het_in_sequenom, - num_hets_dropped_in_sequenom); - - record1 = reader1.next(); - record2 = reader2.next(); - } - else if (comparison > 0) - { - // interval1 is later than interval2. - //System.err.printf("Skipping (2): %s\n", VCFTool.getIntervalFromRecord(record2)); - record2 = reader2.next(); - } - else if (comparison < 0) - { - // interval2 is later than interval1. 
- //System.err.printf("Skipping (1): %s\n", VCFTool.getIntervalFromRecord(record1)); - record1 = reader1.next(); - } - - } - - for (int i = 0; i < max_aaf; i++) - { - output.printf("AAF %d %d %d\n", i, sequenom_aaf_counts[i], sequencing_aaf_counts[i]); - } - - - output.flush(); - output.close(); - - - return 0; - } - - int CheckSingletonMatch(VCFRecord sequencing, VCFRecord sequenom) - { - String singleton_name = ""; - - // first, check sequencing - String[] sample_names = sequencing.getSampleNames(); - List genotypes = sequencing.getVCFGenotypeRecords(); - int n_ref = 0; - int n_alt = 0; - for (int i = 0; i < sample_names.length; i++) - { - VCFGenotypeRecord rec = genotypes.get(i); - List alleles = rec.getAlleles(); - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - if (g.equals("..")) { continue; } - if (g.charAt(0) == sequencing.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; singleton_name = sample_names[i]; } - if (g.charAt(1) == sequencing.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; singleton_name = sample_names[i]; } - } - if (n_alt != 1) { throw new RuntimeException(); } - if (singleton_name.equals("")) { throw new RuntimeException(); } - - // now, check sequenom - sample_names = sequenom.getSampleNames(); - genotypes = sequenom.getVCFGenotypeRecords(); - n_ref = 0; - n_alt = 0; - for (int i = 0; i < sample_names.length; i++) - { - if (sample_names[i].equals(singleton_name)) - { - VCFGenotypeRecord rec = genotypes.get(i); - List alleles = rec.getAlleles(); - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - if (g.equals("..")) { continue; } - if (g.charAt(0) == sequenom.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; singleton_name = sample_names[i]; } - if (g.charAt(1) == 
sequenom.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; singleton_name = sample_names[i]; } - break; - } - } - if (n_alt > 0) { return 1; } - else if (n_ref != 0) { return 0; } - else { return -1; } - } - - int[] ComputeHetMatches(VCFRecord sequencing, VCFRecord sequenom) - { - // first, check sequencing - String[] sample_names = sequencing.getSampleNames(); - List genotypes = sequencing.getVCFGenotypeRecords(); - ArrayList het_samples = new ArrayList(); - for (int i = 0; i < sample_names.length; i++) - { - int n_ref = 0; - int n_alt = 0; - - VCFGenotypeRecord rec = genotypes.get(i); - List alleles = rec.getAlleles(); - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - if (g.equals("..")) { continue; } - if (g.charAt(0) == sequencing.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - if (g.charAt(1) == sequencing.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - if (n_alt == 1) { het_samples.add(sample_names[i]); } - } - - // now, check sequenom - sample_names = sequenom.getSampleNames(); - genotypes = sequenom.getVCFGenotypeRecords(); - int matched_hets = 0; - int dropped_hets = 0; - int mismatched_hets = 0; - int num_hets = het_samples.size(); - for (int i = 0; i < sample_names.length; i++) - { - if (het_samples.contains(sample_names[i])) - { - het_samples.remove(sample_names[i]); - - int n_ref = 0; - int n_alt = 0; - VCFGenotypeRecord rec = genotypes.get(i); - List alleles = rec.getAlleles(); - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - if (g.equals("..")) { dropped_hets += 1; continue; } - if (g.charAt(0) == sequenom.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - if (g.charAt(1) == sequenom.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - if (n_alt == 
1) { matched_hets += 1; } - else { mismatched_hets += 1; } - } - } - - if ((matched_hets + dropped_hets + mismatched_hets) != num_hets) - { - String warning = String.format("WARNING: %d + %d + %d != %d ", - matched_hets, - dropped_hets, - mismatched_hets, - num_hets); - for (int i = 0; i < het_samples.size(); i++) { warning += het_samples.get(i) + " "; } - System.out.println(warning); - } - - int[] ans = new int[2]; - ans[0] = matched_hets; - ans[1] = dropped_hets; - return ans; - } -} diff --git a/archive/java/src/org/broadinstitute/sting/vcftools/VCFTool.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFTool.java deleted file mode 100644 index c6a436eac..000000000 --- a/archive/java/src/org/broadinstitute/sting/vcftools/VCFTool.java +++ /dev/null @@ -1,1570 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.tools.vcf; -import org.broad.tribble.vcf.VCFGenotypeEncoding; -import org.broad.tribble.vcf.VCFGenotypeRecord; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFRecord; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Argument; - -import org.broadinstitute.sting.utils.genotype.vcf.*; - -import org.broadinstitute.sting.utils.GenomeLocParser; - - -import java.io.*; -import java.util.*; -import java.util.zip.*; - -import net.sf.picard.util.Interval; -import net.sf.picard.reference.ReferenceSequenceFileWalker; -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; - - -// First draft of a program for working with VCF files in various ways. - - -/** - * @author jmaguire - */ - - -class VCFValidate extends CommandLineProgram -{ - @Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "print", shortName = "print", doc = "print the vcf records to output", required = false) public Boolean print = false; - @Argument(fullName = "profile", shortName = "profile", doc = "print performance information", required = false) public Boolean profile = false; - @Argument(fullName = "out", shortName = "out", doc = "if --print, write to this file (default is /dev/stdout)", required = false) public String out = "/dev/stdout"; - - @Override - protected int execute() - { - System.out.println("Validating " + filename + "..."); - - VCFReader reader = null; - - if (autocorrect) { reader = new VCFReader(new File(filename),new VCFHomogenizer()); } - else { reader = new VCFReader(new File(filename)); } - - VCFHeader header = reader.getHeader(); - - VCFWriter writer = null; - if 
(print) - { - writer = new VCFWriter(new File(out)); - writer.writeHeader(header); - } - - Date start_time = new Date(); - int n_records_processed = 0; - while(reader.hasNext()) - { - VCFRecord record = reader.next(); - if (print) { writer.addRecord(record); } - - if ((profile) && (n_records_processed % 10000 == 0)) - { - Date current_time = new Date(); - long elapsed = current_time.getTime() - start_time.getTime(); - System.out.printf("RUNTIME: %d records processed in %f seconds; %f seconds per record.\n", - n_records_processed, - (double)elapsed/1000.0, - ((double)elapsed/1000.0)/(double)n_records_processed); - } - n_records_processed += 1; - } - - if (print) { writer.close(); } - - if (autocorrect) { System.out.println(filename + " is VALID (after auto-correction)."); } - else { System.out.println(filename + " is VALID."); } - - return 0; - } -} - -class VCFStats extends CommandLineProgram -{ - @Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "locus", shortName = "locus", doc = "file listing loci to extract", required = true) public String locus_string; - - - @Override - protected int execute() - { - VCFReader reader = null; - - String[] tokens = locus_string.split("\\:|\\-"); - String chr = tokens[0]; - String start = tokens[1]; - String stop = tokens[2]; - Interval locus = new Interval(chr, Integer.parseInt(start), Integer.parseInt(stop)); - - if (autocorrect) - reader = new VCFReader(new File(in_filename),new VCFHomogenizer()); - else - reader = new VCFReader(new File(in_filename)); - - VCFHeader header = reader.getHeader(); - - - ////////////// - // Stats collectors - int transitions = 0; - int transversions = 0; - int dbsnp = 0; - int total_snps = 0; - int[] AC_histogram = new int[1000]; int 
highest_AC = 0; - int[] DP_histogram = new int[1000000]; int highest_DP = 0; - - int[] AC_transitions = new int[1000]; - int[] DP_transitions = new int[1000]; - - int depth_sum = 0; - - boolean before = true; - - while(reader.hasNext()) - { - VCFRecord record = null; - try - { - record = reader.next(); - } - catch (Exception e) - { - System.err.printf("WARNING: %s\n", e.toString()); - continue; - } - Interval this_locus = VCFTool.getIntervalFromRecord(record); - if (locus.intersects(this_locus)) - { - before = false; - - Map info = record.getInfoValues(); - - int AC = 0; - int DP = 0; - int DB = 0; - - if (info.containsKey("AC")) { AC = Integer.parseInt(info.get("AC")); } - if (info.containsKey("DP")) { DP = Integer.parseInt(info.get("DP")); } - if (info.containsKey("DB")) { DB = Integer.parseInt(info.get("DB")); } - - depth_sum += DP; - - dbsnp += DB; // 1 if in dbsnp, 0 otherwise - - AC_histogram[AC] += 1; - if (AC > highest_AC) { highest_AC = AC; } - - DP_histogram[DP] += 1; - if (DP > highest_DP) { highest_DP = DP; } - - if (VCFTool.isTransition(record)) { transitions += 1; AC_transitions[AC] += 1; DP_transitions[DP] += 1; } - else { transversions += 1; } - - total_snps += 1; - //System.out.printf("%s\n", record.toStringEncoding(header)); - } - else if ((before == false) && (this_locus.compareTo(locus) > 0)) { break; } - } - - double mean_depth = (double)depth_sum / (double)total_snps; - double snp_rate = 1.0 / ((double)total_snps / (double)locus.length()); - - int DP_running_sum = 0; - int DP_1percent_low = -1; - int DP_5percent_low = -1; - for (int DP = 1; DP <= highest_DP; DP++) - { - if ((DP_1percent_low == -1) && (DP_running_sum >= 0.01*(double)total_snps)) { DP_1percent_low = DP; } - if ((DP_5percent_low == -1) && (DP_running_sum >= 0.05*(double)total_snps)) { DP_5percent_low = DP; } - DP_running_sum += DP_histogram[DP]; - } - - DP_running_sum = 0; - int DP_1percent_high = -1; - int DP_5percent_high = -1; - for (int DP = highest_DP; DP >= 0; DP--) - { - 
if ((DP_1percent_high == -1) && (DP_running_sum >= 0.01*(double)total_snps)) { DP_1percent_high = DP; } - if ((DP_5percent_high == -1) && (DP_running_sum >= 0.05*(double)total_snps)) { DP_5percent_high = DP; } - DP_running_sum += DP_histogram[DP]; - } - - - System.out.printf("Locus : %s\n", locus.toString()); - System.out.printf("Total SNPs : %d\n", total_snps); - System.out.printf("SNP Rate : 1/%f\n", snp_rate); - System.out.printf("Ts/Tv : %.02f\n", (double)transitions / (double)transversions); - System.out.printf("%%dbsnp : %.02f\n", 100.0 * (double)dbsnp / (double)total_snps); - System.out.printf("Average Depth : %f\n", mean_depth); - System.out.printf("1%% Depth bounds : %d %d\n", DP_1percent_low, DP_1percent_high); - System.out.printf("5%% Depth bounds : %d %d\n", DP_5percent_low, DP_5percent_high); - System.out.printf("\n"); - - System.out.printf("table\tAAF\tCount\tTs/Tv\n"); - for (int AC = 1; AC <= highest_AC; AC++) - { - System.out.printf("AAF\t%d\t%d\t%f\n", AC, AC_histogram[AC], (double)AC_transitions[AC]/(double)(AC_histogram[AC]-AC_transitions[AC])); - } - System.out.printf("\n"); - - - System.out.printf("DEPTH\ttable\tDepth\tCount\tTs/Tv\n"); - for (int DP = 1; DP <= highest_DP; DP++) - { - System.out.printf("%d\t%d\t%f\n", DP, DP_histogram[DP], (double)DP_transitions[DP]/(double)(DP_histogram[DP]-DP_transitions[DP])); - } - System.out.printf("\n"); - - return 0; - } - -} - -class CheckRefFields extends CommandLineProgram -{ - @Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "fasta", shortName = "fasta", doc = "reference FASTA", required = true) public String fasta_filename; - - @Override - protected int execute() - { - System.out.println("Checking " + filename + "..."); - - VCFReader reader = 
null; - - if (autocorrect) { reader = new VCFReader(new File(filename),new VCFHomogenizer()); } - else { reader = new VCFReader(new File(filename)); } - - ReferenceSequenceFileWalker ref = new ReferenceSequenceFileWalker(new File(fasta_filename)); - String ref_seq_name = ""; - byte[] ref_seq = null; - SAMSequenceDictionary ref_dict = ref.getSequenceDictionary(); - - VCFHeader header = reader.getHeader(); - - Date start_time = new Date(); - int n_records_processed = 0; - while(reader.hasNext()) - { - VCFRecord record = reader.next(); - - String chr = record.getChr(); - if (! chr.equals(ref_seq_name)) - { - System.out.println("Loading " + chr); - ref_seq = ref.get(ref_dict.getSequence(chr).getSequenceIndex()).getBases(); - ref_seq_name = chr; - } - - long offset = record.getStart(); - char vcf_ref_base = record.getReference().charAt(0); - char fasta_ref_base = (char)ref_seq[(int)offset-1]; - - List alleles = record.getAlternateAlleles(); - char vcf_alt_base = alleles.get(0).getBases().charAt(0); - - //System.out.println(chr + " " + offset + " " + fasta_ref_base + " " + vcf_ref_base + " " + vcf_alt_base); - - String ans = null; - if (vcf_ref_base != fasta_ref_base) - { - System.out.println("Error! Ref field does not match fasta. 
Fasta says " + fasta_ref_base); - System.out.println(record.toStringEncoding(header)); - } - } - - System.out.println("All reference fields correct."); - return 0; - } -} - - -class FixRefFields extends CommandLineProgram -{ - @Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "fasta", shortName = "fasta", doc = "reference FASTA", required = true) public String fasta_filename; - @Argument(fullName = "output", shortName = "output", doc = "output file", required = true) public String output_filename; - - @Override - protected int execute() - { - System.out.println("Fixing " + filename + "..."); - - VCFReader reader = null; - - if (autocorrect) { reader = new VCFReader(new File(filename),new VCFHomogenizer()); } - else { reader = new VCFReader(new File(filename)); } - - ReferenceSequenceFileWalker ref = new ReferenceSequenceFileWalker(new File(fasta_filename)); - String ref_seq_name = ""; - byte[] ref_seq = null; - SAMSequenceDictionary ref_dict = ref.getSequenceDictionary(); - - - VCFHeader header = reader.getHeader(); - - PrintStream output; - try - { - VCFWriter writer = new VCFWriter(new File(output_filename)); - writer.writeHeader(header); - writer.close(); - output = new PrintStream(new FileOutputStream(output_filename, true)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - Date start_time = new Date(); - int n_records_processed = 0; - while(reader.hasNext()) - { - VCFRecord record = reader.next(); - - String chr = record.getChr(); - if (! 
chr.equals(ref_seq_name)) - { - System.out.println("Loading " + chr); - ref_seq = ref.get(ref_dict.getSequence(chr).getSequenceIndex()).getBases(); - ref_seq_name = chr; - } - - long offset = record.getStart(); - char vcf_ref_base = record.getReference().charAt(0); - char fasta_ref_base = (char)ref_seq[(int)offset-1]; - - List alleles = record.getAlternateAlleles(); - char vcf_alt_base = alleles.get(0).getBases().charAt(0); - - //System.out.println(chr + " " + offset + " " + fasta_ref_base + " " + vcf_ref_base + " " + vcf_alt_base); - - String ans = null; - if ((vcf_ref_base != fasta_ref_base) && ((vcf_alt_base == fasta_ref_base) || (vcf_alt_base == '.'))) - { - // swap! - String s = record.toStringEncoding(header); - String[] tokens = s.split("\\s+"); - tokens[3] = Character.toString(fasta_ref_base); - tokens[4] = Character.toString(vcf_ref_base); - for (int i = 9; i < tokens.length; i++) - { - tokens[i] = tokens[i].replaceAll("0", "A"); - tokens[i] = tokens[i].replaceAll("1", "B"); - tokens[i] = tokens[i].replaceAll("B", "0"); - tokens[i] = tokens[i].replaceAll("A", "1"); - } - - ans = ""; - for (int i = 0; i < tokens.length; i++) - { - ans = ans + tokens[i] + "\t"; - } - ans.replaceAll("\\s+$", ""); - - //System.out.println("from: " + s); - //System.out.println("to: " + ans); - } - else - { - ans = record.toStringEncoding(header); - } - - output.println(ans); - } - - output.flush(); - output.close(); - - System.out.println("Done."); - return 0; - } -} - -class VCFGrep extends CommandLineProgram -{ - @Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename; - @Argument(fullName = "output", shortName = "output", doc = "file to write", required = true) public String out_filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "loci", shortName = "loci", doc = 
"file listing loci to extract", required = true) public String loci_filename; - - @Override - protected int execute() - { - HashSet loci = new HashSet(); - try - { - Scanner loci_reader; - - if (loci_filename.endsWith(".gz")) { loci_reader = new Scanner(new GZIPInputStream(new FileInputStream(loci_filename))); } - else { loci_reader = new Scanner(new File(loci_filename)); } - - while(loci_reader.hasNextLine()) - { - String line = loci_reader.nextLine(); - line = line.replaceAll("\\s+", ""); - loci.add(line); - } - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - try - { - PrintStream output = new PrintStream(new File(out_filename)); - - Scanner reader; - if (in_filename.endsWith(".gz")) { reader = new Scanner(new GZIPInputStream(new FileInputStream(in_filename))); } - else { reader = new Scanner(new File(in_filename)); } - while(reader.hasNextLine()) - { - String line = reader.nextLine(); - - if (line.matches("^\\#.*$")) { output.print(line + "\n"); continue; } - - String[] tokens = line.split("\\s+"); - String locus = tokens[0] + ":" + tokens[1]; - if (loci.contains(locus)) { output.print(line + "\n"); continue; } - } - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - return 0; - } - -} - -class VCFGrep_old extends CommandLineProgram -{ - @Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename; - @Argument(fullName = "output", shortName = "output", doc = "file to write", required = true) public String out_filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "loci", shortName = "loci", doc = "file listing loci to extract", required = true) public String loci_filename; - - @Override - protected int execute() - { - VCFReader reader = null; - VCFWriter writer = null; - - HashSet loci = new HashSet(); - try - { - Scanner 
loci_reader = new Scanner(new File(loci_filename)); - while(loci_reader.hasNextLine()) - { - String line = loci_reader.nextLine(); - String[] tokens = line.split("\\:"); - - String chr = tokens[0]; - String off = tokens[1]; - loci.add(new Interval(chr, Integer.parseInt(off), Integer.parseInt(off))); - } - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - if (autocorrect) { reader = new VCFReader(new File(in_filename),new VCFHomogenizer()); } - else { reader = new VCFReader(new File(in_filename)); } - - - writer = new VCFWriter(new File(out_filename)); - writer.writeHeader(reader.getHeader()); - - while(reader.hasNext()) - { - VCFRecord record = reader.next(); - Interval locus = VCFTool.getIntervalFromRecord(record); - if (loci.contains(locus)) { writer.addRecord(record); } - } - writer.close(); - - return 0; - } - -} - -class PrintGQ extends CommandLineProgram -{ - @Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename; - - @Override - protected int execute() - { - VCFReader reader; - VCFReader reader2; - - reader = new VCFReader(new File(filename),new VCFHomogenizer()); - - VCFHeader header = reader.getHeader(); - VCFRecord record = reader.next(); - - while(true) - { - if (record == null) { break; } - - Interval interval = VCFTool.getIntervalFromRecord(record); - - if (record.isFiltered()) - { - record = reader.next(); - } - - char ref = record.getReference().charAt(0); - - String[] sample_names = record.getSampleNames(); - - List genotypes = record.getVCFGenotypeRecords(); - - for (int i = 0; i < sample_names.length; i++) - { - VCFGenotypeRecord rec = genotypes.get(i); - - String gq = rec.getFields().get("GQ"); - - List alleles = rec.getAlleles(); - - String g = ""; - - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - - Arrays.sort(c); - - g = new String(c); - - System.out.println(g + " " + gq); - } - - record = reader.next(); - } - 
- return 0; - } -} - -class VCFSimpleStats extends CommandLineProgram -{ - @Argument(fullName = "vcf1", shortName = "vcf1", doc = "file to open", required = true) public String filename1; - @Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false; - @Argument(fullName = "min_call_rate", shortName = "min_call_rate", doc = "what fraction of samples must have a call", required = false) public double min_call_rate = 0.9; - - @Override - protected int execute() - { - //System.out.println("Loading " + filename + "..."); - - PrintStream output = null; - try - { - output = new PrintStream(new FileOutputStream(output_filename)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - VCFReader reader1; - - if (autocorrect) - { - reader1 = new VCFReader(new File(filename1),new VCFHomogenizer()); - } - else - { - reader1 = new VCFReader(new File(filename1)); - } - - VCFHeader header1 = reader1.getHeader(); - - VCFRecord record1 = reader1.next(); - - int TP = 0; - int FP = 0; - int TN = 0; - int FN = 0; - int total = 0; - int dropped = 0; - - int ts = 0; - int tv = 0; - - while(true) - { - if (record1 == null) { break; } - - Interval interval1 = VCFTool.getIntervalFromRecord(record1); - - // (unless it is "filtered") - if (record1.isFiltered()) - { - record1 = reader1.next(); - } - - char ref = record1.getReference().charAt(0); - - - String[] sample_names1 = record1.getSampleNames(); - - List genotypes1 = record1.getVCFGenotypeRecords(); - - long n_ref_1 = 0; - long n_alt_1 = 0; - long n_total_1 = 0; - long n_calls_1 = 0; - long n_dropped_1 = 0; - - for (int i = 0; i < 
sample_names1.length; i++) - { - VCFGenotypeRecord rec1 = genotypes1.get(i); - - //if (rec2 == null) { continue; } - - Long gq1; - - if (rec1.getFields().get("GQ") != null) - { - Double gq1_double = Double.parseDouble(rec1.getFields().get("GQ")); - gq1 = gq1_double.longValue(); - } - else - { - gq1 = 0L; - } - - List alleles1 = rec1.getAlleles(); - - String g1 = ""; - - for (int j = 0; j < alleles1.size(); j++) { g1 += alleles1.get(j).getBases(); } - - char[] c1 = g1.toCharArray(); - - Arrays.sort(c1); - - g1 = new String(c1); - - n_total_1 += 1; - - if (g1.equals("..")) - { - n_dropped_1 += 1; - continue; - } - - n_calls_1 += 1; - - if (g1.charAt(0) == ref) { n_ref_1 += 1; } else { n_alt_1 += 1; } - if (g1.charAt(1) == ref) { n_ref_1 += 1; } else { n_alt_1 += 1; } - } - - if (((double)n_calls_1 / (double)n_total_1) >= min_call_rate) - { - if (n_alt_1 == 0) { FP += 1; } - if (n_alt_1 > 0) { TP += 1; } - total += 1; - - if (VCFTool.isTransition(record1)) { ts += 1; } - else { tv += 1; } - } - else - { - dropped += 1; - } - - if ((verbose) && (((double)n_calls_1 / (double)n_total_1) >= min_call_rate)) - { - //output.printf("SNP " - // + interval1.toString() - // + " " + n_total_1 + " " + n_calls_1 + " " + (double)n_calls_1/(double)n_total_1 + " " + n_ref_1 + " " + n_alt_1 + "\n"); - if (n_alt_1 == 0) { output.printf("FP: %s\n", interval1.toString()); } - if (n_alt_1 != 0) { output.printf("TP: %s\n", interval1.toString()); } - } - - record1 = reader1.next(); - } - - - // Now output the statistics. 
- - output.printf("TP FP dropped ts tv ts/tv\n%d(%f) %d(%f) %d %d %d %f\n", - TP, (double)TP/(double)total, - FP, (double)FP/(double)total, - dropped, - ts, tv, - (double)ts/(double)tv); - - output.flush(); - output.close(); - - - return 0; - } -} - - -class VCFConcordance extends CommandLineProgram -{ - @Argument(fullName = "vcf1", shortName = "vcf1", doc = "file to open", required = true) public String filename1; - @Argument(fullName = "vcf2", shortName = "vcf2", doc = "file to open", required = true) public String filename2; - @Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename; - @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false; - @Argument(fullName = "list_genotypes", shortName = "list_genotypes", doc = "print each person's genotype for debugging", required = false) public Boolean list_genotypes = false; - @Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1; - @Argument(fullName = "samples", shortName = "samples", doc = "optional list of individuals to score", required = false) public String samples_filename = null; - @Argument(fullName = "r2_bin_size", shortName = "r2_bin_size", doc = "size of an r2 bin for calculating error rates", required = false) public double r2_bin_size = 0.01; - - - @Override - protected int execute() - { - //System.out.println("Loading " + filename + "..."); - - ///////////////////////////////// - // All the various concordance counters - - HashMap individual = new HashMap(); - HashMap AAF = new HashMap(); - HashMap Qual = new HashMap(); - HashMap R2 = new HashMap(); - - int shared_ts = 
0; - int shared_tv = 0; - int shared_dbsnp = 0; - int shared_total = 0; - - int unique1_ts = 0; - int unique1_tv = 0; - int unique1_dbsnp = 0; - int unique1_total = 0; - - int unique2_ts = 0; - int unique2_tv = 0; - int unique2_dbsnp = 0; - int unique2_total = 0; - - // - ///////////////////////////////// - - HashSet sample_mask = new HashSet(); - if (samples_filename != null) - { - Scanner samples_reader = null; - try - { - samples_reader = new Scanner(new File(samples_filename)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - while(samples_reader.hasNextLine()) - { - String line = samples_reader.nextLine(); - line.replaceAll("^\\s+|\\s+$", ""); - sample_mask.add(line); - } - } - - - PrintStream output = null; - try - { - output = new PrintStream(new FileOutputStream(output_filename)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - - VCFReader reader1; - VCFReader reader2; - - if (autocorrect) - { - reader1 = new VCFReader(new File(filename1),new VCFHomogenizer()); - reader2 = new VCFReader(new File(filename2),new VCFHomogenizer()); - } - else - { - reader1 = new VCFReader(new File(filename1)); - reader2 = new VCFReader(new File(filename2)); - } - - VCFHeader header1 = reader1.getHeader(); - VCFHeader header2 = reader2.getHeader(); - - VCFRecord record1 = reader1.next(); - VCFRecord record2 = reader2.next(); - - int number_sites_unique_to_file1 = 0; - int number_sites_unique_to_file2 = 0; - int number_sites_shared = 0; - - while(true) - { - if (record1 == null) { break; } - if (record2 == null) { break; } - - - Interval interval1 = VCFTool.getIntervalFromRecord(record1); - Interval interval2 = VCFTool.getIntervalFromRecord(record2); - - //int comparison = interval1.compareTo(interval2); - int comparison = VCFTool.compareIntervals(interval1, interval2); - - //System.out.println("DBG: " + interval1 + " " + interval2 + " " + comparison); - - if (comparison == 0) - { - // records match! compute concordance. 
- - // (unless one of them is "filtered") - if (record1.isFiltered() || record2.isFiltered()) - { - record1 = reader1.next(); - record2 = reader2.next(); - continue; - } - - - char ref = record1.getReference().charAt(0); - - String[] sample_names1 = record1.getSampleNames(); - String[] sample_names2 = record2.getSampleNames(); - - - Map info1 = record1.getInfoValues(); - Map info2 = record2.getInfoValues(); - double r2_1 = 0; - double r2_2 = 0; - if (info1.containsKey("R2")) { r2_1 = Double.parseDouble(info1.get("R2")); } - if (info2.containsKey("R2")) { r2_2 = Double.parseDouble(info2.get("R2")); } - - - number_sites_shared += 1; - if (VCFTool.isTransition(record1)) { shared_ts += 1; } - else { shared_tv += 1; } - if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { shared_dbsnp += 1; } - shared_total += 1; - - - List genotypes1 = record1.getVCFGenotypeRecords(); - List genotypes2 = record2.getVCFGenotypeRecords(); - - Map map2 = new HashMap(); - for (int i = 0; i < genotypes2.size(); i++) - { - map2.put(genotypes2.get(i).getSampleName(), genotypes2.get(i)); - } - - GenotypeConcordance SNP = new GenotypeConcordance(interval1.toString()); - - long n_ref = 0; - long n_alt = 0; - - for (int i = 0; i < sample_names1.length; i++) - { - if ((samples_filename != null) && - (! 
sample_mask.contains(sample_names1[i]))) - { - continue; - } - - - VCFGenotypeRecord rec1 = genotypes1.get(i); - VCFGenotypeRecord rec2 = map2.get(sample_names1[i]); - - if (rec2 == null) { continue; } - - Long gq1; - if (rec1.getFields().get("GQ") != null) - { - Double gq1_double = Double.parseDouble(rec1.getFields().get("GQ")); - gq1 = gq1_double.longValue(); - } - else - { - gq1 = 0L; - } - - Long gq2; - if (rec2.getFields().get("GQ") != null) - { - Double gq2_double = Double.parseDouble(rec2.getFields().get("GQ")); - gq2 = gq2_double.longValue(); - } - else - { - gq2 = 0L; - } - - List alleles1 = rec1.getAlleles(); - List alleles2 = rec2.getAlleles(); - - String g1 = ""; - String g2 = ""; - - for (int j = 0; j < alleles1.size(); j++) { g1 += alleles1.get(j).getBases(); } - for (int j = 0; j < alleles2.size(); j++) { g2 += alleles2.get(j).getBases(); } - - char[] c1 = g1.toCharArray(); - char[] c2 = g2.toCharArray(); - - Arrays.sort(c1); - Arrays.sort(c2); - - g1 = new String(c1); - g2 = new String(c2); - - if (list_genotypes) - { - String flag = ""; - if (! g1.equals(g2)) { flag = "X"; } - output.printf("GENOTYPES " - + interval1.toString() - + " " + sample_names1[i] - + " " + g1 - + " " + g2 - + " " + gq1 - + " " + gq2 - + " " + flag + "\n"); - } - - if ((g1.equals("..")) || - (g2.equals(".."))) - { - continue; - } - - if (g1.charAt(0) == ref) { n_ref += 1; } else { n_alt += 1; } - if (g1.charAt(1) == ref) { n_ref += 1; } else { n_alt += 1; } - - if (! individual.containsKey(sample_names1[i])) { individual.put(sample_names1[i], new GenotypeConcordance(sample_names1[i])); } - if (! Qual.containsKey(gq1)) { Qual.put(gq1, new GenotypeConcordance(Long.toString(gq1))); } - - individual.get(sample_names1[i]).add(ref, g1, g2); - Qual.get(gq1).add(ref, g1, g2); - SNP.add(ref, g1, g2); - - } - - if (verbose) - { - //output.printf("SNP " + SNP.toString()); - output.printf("SNP " + SNP.toLine()); - } - - if (! 
AAF.containsKey(n_alt)) { AAF.put(n_alt, new GenotypeConcordance(Long.toString(n_alt))); } - AAF.get(n_alt).add(SNP); - - long r2_index = (long)(r2_1 / r2_bin_size); - if (! R2.containsKey(r2_index)) { R2.put(r2_index, new GenotypeConcordance(Double.toString(r2_1))); } - R2.get(r2_index).add(SNP); - - //System.out.printf("DBG: %f %f\n", r2_1, r2_2); - //System.out.printf("DBG: %f %d %s\n", r2_1, r2_index, SNP.toString()); - - record1 = reader1.next(); - record2 = reader2.next(); - } - else if (comparison > 0) - { - if (record2.isFiltered()) { record2 = reader2.next(); continue; } - - // interval1 is later than interval2. - Map info2 = record2.getInfoValues(); - number_sites_unique_to_file2 += 1; - if (VCFTool.isTransition(record2)) { unique2_ts += 1; } - else { unique2_tv += 1; } - if ((info2.get("DB") != null) && (Integer.parseInt(info2.get("DB")) == 1)) { unique2_dbsnp += 1; } - unique2_total += 1; - - //if (verbose) { output.printf("DBG: skipping %s\n", record2.toStringEncoding(header2)); } - - record2 = reader2.next(); - } - else if (comparison < 0) - { - if (record1.isFiltered()) { record1 = reader1.next(); continue; } - - // interval2 is later than interval1. - Map info1 = record1.getInfoValues(); - number_sites_unique_to_file1 += 1; - if (VCFTool.isTransition(record1)) { unique1_ts += 1; } - else { unique1_tv += 1; } - if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { unique1_dbsnp += 1; } - unique1_total += 1; - - //if (verbose) { output.printf("DBG: skipping %s\n", record1.toStringEncoding(header1)); } - - record1 = reader1.next(); - } - } - - - // Now output the statistics. 
- if (verbose) - { - output.printf("\n"); - Object[] individuals = individual.keySet().toArray(); - for (int i = 0; i < individuals.length; i++) - { - String ind = (String)individuals[i]; - output.print("INDIVIDUAL " + individual.get(ind).toString()); - } - - output.printf("\n"); - Object[] AAFs = AAF.keySet().toArray(); - for (int i = 0; i < AAFs.length; i++) - { - Long aaf = (Long)AAFs[i]; - output.print("AAF " + AAF.get(aaf).toString()); - } - - output.printf("\n"); - Object[] quals = Qual.keySet().toArray(); - for (int i = 0; i < quals.length; i++) - { - Long qual = (Long)quals[i]; - output.print("QUAL " + Qual.get(qual).toString()); - } - output.printf("\n"); - - output.printf("\n"); - Object[] R2s = R2.keySet().toArray(); - for (int i = 0; i < AAFs.length; i++) - { - Long r2 = (Long)R2s[i]; - output.print("R2 " + R2.get(r2).toString()); - } - } - - output.printf("Number of sites shared : %d %f %f\n", number_sites_shared, - (double)shared_ts/(double)shared_tv, - (double)shared_dbsnp/(double)(shared_ts+shared_tv)); - - output.printf("Number of sites unique to %s: %d %f %f\n", filename1, number_sites_unique_to_file1, - (double)unique1_ts/(double)unique1_tv, - (double)unique1_dbsnp/(double)(unique1_ts+unique1_tv)); - - output.printf("Number of sites unique to %s: %d %f %f\n", filename2, number_sites_unique_to_file2, - (double)unique2_ts/(double)unique2_tv, - (double)unique2_dbsnp/(double)(unique2_ts+unique2_tv)); - - output.printf("\n"); - Object[] individuals = individual.keySet().toArray(); - for (int i = 0; i < individuals.length; i++) - { - String ind = (String)individuals[i]; - output.printf("INDIVIDUAL %s %f %d %d\n", ind, individual.get(ind).errorRate(), individual.get(ind).total(), individual.get(ind).totalNonHomRef()); - } - - output.printf("\n"); - Object[] AAFs = AAF.keySet().toArray(); - for (int i = 0; i < AAFs.length; i++) - { - Long aaf = (Long)AAFs[i]; - output.printf("AAF %d %f %d %d %f\n", aaf, AAF.get(aaf).errorRate(), AAF.get(aaf).total(), 
AAF.get(aaf).totalNonHomRef(), AAF.get(aaf).hetErrorRate()); - } - - output.printf("\n"); - Object[] quals = Qual.keySet().toArray(); - for (int i = 0; i < quals.length; i++) - { - Long qual = (Long)quals[i]; - output.printf("QUAL %d %f %d %d\n", qual, Qual.get(qual).errorRate(), Qual.get(qual).total(), Qual.get(qual).totalNonHomRef()); - } - - output.printf("\n"); - Object[] R2s = R2.keySet().toArray(); - for (int i = 0; i < R2s.length; i++) - { - Long r2 = (Long)R2s[i]; - output.printf("R2 %f %f %d %d\n", (double)r2 * r2_bin_size, R2.get(r2).errorRate(), R2.get(r2).total(), R2.get(r2).totalNonHomRef()); - } - - output.flush(); - output.close(); - - - return 0; - } -} - -public class VCFTool -{ - public static void main(String args[]) - { - // silence log4j messages. - //appender = new FileAppender(layout, clp.toFile, false); - //logger.addAppender(appender); - - SetupSequenceDictionary(); - - String mode = args[0]; - String[] realArgs = Arrays.copyOfRange(args, 1, args.length); - - if (mode.equals("validate")) - { - VCFValidate cm = new VCFValidate(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("grep")) - { - VCFGrep cm = new VCFGrep(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("concordance")) - { - VCFConcordance cm = new VCFConcordance(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("simple_stats")) - { - VCFSimpleStats cm = new VCFSimpleStats(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("printGQ")) - { - PrintGQ cm = new PrintGQ(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("fix_ref_fields")) - { - FixRefFields cm = new FixRefFields(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("check_ref_fields")) - { - CheckRefFields cm = new CheckRefFields(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if 
(mode.equals("stats")) - { - VCFStats cm = new VCFStats(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("sequenom")) - { - VCFSequenomAnalysis cm = new VCFSequenomAnalysis(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("sequenom2")) - { - VCFSequenomAnalysis2 cm = new VCFSequenomAnalysis2(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("call_rates")) - { - VCFCallRates cm = new VCFCallRates(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("optimize")) - { - VCFOptimize cm = new VCFOptimize(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("apply_cuts")) - { - VCFApplyCuts cm = new VCFApplyCuts(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - if (mode.equals("merge")) - { - VCFMerge cm = new VCFMerge(); - CommandLineProgram.start(cm,realArgs); - System.exit(0); - } - - System.out.printf("ERROR: mode %s not defined.\n", mode); - System.exit(-1); - - } - - - ///////////////////////// - // Some helpful utilities. - - // Total hack to set up a sequence dictionary for 1kG hg18/build36 without needing to load a fasta. 
- public static SAMSequenceDictionary dict; - public static void SetupSequenceDictionary() - { - dict = new SAMSequenceDictionary(); - for (int i = 1; i <= 22; i++) - { - dict.addSequence(new SAMSequenceRecord(String.format("%d", i))); - } - dict.addSequence(new SAMSequenceRecord("X")); - dict.addSequence(new SAMSequenceRecord("Y")); - dict.addSequence(new SAMSequenceRecord("M")); - GenomeLocParser.setupRefContigOrdering(dict); - } - - public static Interval getIntervalFromRecord(VCFRecord record) - { - String chr = record.getChr(); - long off = record.getStart(); - return new Interval(chr, (int)off, (int)off); - } - - public static char getAlt(VCFRecord record) - { - List alleles = record.getAlternateAlleles(); - char alt = alleles.get(0).getBases().charAt(0); - return alt; - } - - public static boolean isTransition(VCFRecord record) - { - char ref = record.getReference().charAt(0); - List alleles = record.getAlternateAlleles(); - char alt = alleles.get(0).getBases().charAt(0); - - if (((ref == 'A') && (alt == 'G')) || - ((ref == 'G') && (alt == 'A')) || - ((ref == 'C') && (alt == 'T')) || - ((ref == 'T') && (alt == 'C'))) - { - return true; - } - else - { - return false; - } - } - - - public static int Compute_n_total(VCFRecord record) - { - return VCFTool.Compute_n_total(record, (String[])null); - } - - public static int Compute_n_total(VCFRecord record, String[] sample_names) - { - HashSet set = null; - if (sample_names != null) - { - set = new HashSet(); - for (int i = 0; i < sample_names.length; i++) { set.add(sample_names[i]); } - } - return VCFTool.Compute_n_total(record, set); - } - - public static int Compute_n_total(VCFRecord record, Set sample_mask) - { - String[] sample_names = record.getSampleNames(); - List genotypes = record.getVCFGenotypeRecords(); - int n_ref = 0; - int n_alt = 0; - for (int i = 0; i < sample_names.length; i++) - { - if ((sample_mask != null) && (! 
sample_mask.contains(sample_names[i]))) - { - continue; - } - - VCFGenotypeRecord rec = genotypes.get(i); - List alleles = rec.getAlleles(); - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - if (g.equals("..")) { continue; } - if (g.charAt(0) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - if (g.charAt(1) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - } - return n_alt + n_ref; - } - - public static int Compute_n_alt(VCFRecord record) - { - return VCFTool.Compute_n_alt(record, (String[])null); - } - - public static int Compute_n_alt(VCFRecord record, String[] sample_names) - { - HashSet set = null; - if (sample_names != null) - { - set = new HashSet(); - for (int i = 0; i < sample_names.length; i++) { set.add(sample_names[i]); } - } - return VCFTool.Compute_n_alt(record, set); - } - - public static int Compute_n_alt(VCFRecord record, Set sample_mask) - { - String[] sample_names = record.getSampleNames(); - List genotypes = record.getVCFGenotypeRecords(); - int n_ref = 0; - int n_alt = 0; - for (int i = 0; i < sample_names.length; i++) - { - // Skip samples we should skip. - if ((sample_mask != null) && (! 
sample_mask.contains(sample_names[i]))) - { - continue; - } - - VCFGenotypeRecord rec = genotypes.get(i); - List alleles = rec.getAlleles(); - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - if (g.equals("..")) { continue; } - if (g.charAt(0) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - if (g.charAt(1) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - } - return n_alt; - } - - - public static int Compute_n_het(VCFRecord record) - { - return VCFTool.Compute_n_het(record, (String[])null); - } - - public static int Compute_n_het(VCFRecord record, String[] sample_names) - { - HashSet set = null; - if (sample_names != null) - { - set = new HashSet(); - for (int i = 0; i < sample_names.length; i++) { set.add(sample_names[i]); } - } - return VCFTool.Compute_n_het(record, set); - } - - public static int Compute_n_het(VCFRecord record, Set sample_mask) - { - String[] sample_names = record.getSampleNames(); - List genotypes = record.getVCFGenotypeRecords(); - int n_het = 0; - for (int i = 0; i < sample_names.length; i++) - { - // Skip samples we should skip. - if ((sample_mask != null) && (! 
sample_mask.contains(sample_names[i]))) - { - continue; - } - - int n_ref = 0; - int n_alt = 0; - - VCFGenotypeRecord rec = genotypes.get(i); - List alleles = rec.getAlleles(); - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - if (g.equals("..")) { continue; } - if (g.charAt(0) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - if (g.charAt(1) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - if (n_alt == 1) { n_het += 1; } - } - return n_het; - } - - public static double Compute_failure_rate(VCFRecord record) - { - String[] sample_names = record.getSampleNames(); - List genotypes = record.getVCFGenotypeRecords(); - double failure_rate = 0.0; - for (int i = 0; i < sample_names.length; i++) - { - VCFGenotypeRecord rec = genotypes.get(i); - List alleles = rec.getAlleles(); - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - if (g.equals("..")) { failure_rate += 1; continue; } - } - return failure_rate / (double)sample_names.length; - } - - public static double Compute_HWE(VCFRecord record) - { - return VCFTool.Compute_HWE(record, (String[])null); - } - - public static double Compute_HWE(VCFRecord record, String[] sample_names) - { - HashSet set = null; - if (sample_names != null) - { - set = new HashSet(); - for (int i = 0; i < sample_names.length; i++) { set.add(sample_names[i]); } - } - return VCFTool.Compute_HWE(record, set); - } - - public static double Compute_HWE(VCFRecord record, Set sample_mask) - { - int ref = 0; - int het = 0; - int hom = 0; - int N = 0; - - String[] sample_names = record.getSampleNames(); - List genotypes = record.getVCFGenotypeRecords(); - for (int i = 0; i < sample_names.length; i++) - { - // Skip samples we should skip. - if ((sample_mask != null) && (! 
sample_mask.contains(sample_names[i]))) - { - continue; - } - - int n_ref = 0; - int n_alt = 0; - - VCFGenotypeRecord rec = genotypes.get(i); - List alleles = rec.getAlleles(); - String g = ""; - for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); } - char[] c = g.toCharArray(); - Arrays.sort(c); - g = new String(c); - if (g.equals("..")) { continue; } - if (g.charAt(0) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - if (g.charAt(1) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; } - - if (n_ref == 2) { ref += 1; } - else if (n_ref == 1 && n_alt == 1) { het += 1; } - else if (n_alt == 2) { hom += 1; } - - N += 1; - } - - double p = (2.0 * ref + het) / (2.0 * (ref + het + hom)); - double q = 1.0 - p; - - //System.out.printf("DBG: p=%f q=%f ref=%d het=%d hom=%d\n", p, q, ref, het, hom); - - double expected_ref = p * p * N; - double expected_het = 2.0 * p * q * N; - double expected_hom = q * q * N; - - double chi_squared = (Math.pow(ref - expected_ref,2)/expected_ref) + (Math.pow(het - expected_het,2)/expected_het) + (Math.pow(hom - expected_hom,2)/expected_hom); - - return chi_squared; - } - - // This function assumes a 1-degree of freedom chi-squared. 
- public static double P_from_Chi(double chi) - { - double gamma = 1.772454; - double a = Math.pow(2,0.5) * gamma; - double b = Math.pow(chi, 0.5-1.0) * Math.exp((-1.0 * chi)/2.0); - double ans = (1.0/a) * b; - return ans; - } - - public static int compareIntervals(Interval a, Interval b) - { - int chr_a; - int chr_b; - - if (a.getSequence().equals("X")) { chr_a = 23; } - else if (a.getSequence().equals("Y")) { chr_a = 24; } - else if (a.getSequence().equals("M")) { chr_a = 25; } - else { chr_a = Integer.parseInt(a.getSequence()); } - - if (b.getSequence().equals("X")) { chr_b = 23; } - else if (b.getSequence().equals("Y")) { chr_b = 24; } - else if (b.getSequence().equals("M")) { chr_b = 25; } - else { chr_b = Integer.parseInt(b.getSequence()); } - - int start_a = a.getStart(); - int start_b = b.getStart(); - - int end_a = a.getEnd(); - int end_b = b.getEnd(); - - if (chr_a < chr_b) { return -1; } - else if (chr_a > chr_b) { return 1; } - else if (start_a < start_b) { return -1; } - else if (start_a > start_b) { return 1; } - else if (end_a < end_b) { return -1; } - else if (end_a > end_b) { return 1; } - else { return 0; } - } - -} - diff --git a/archive/python/analyzeRecalQuals.py b/archive/python/analyzeRecalQuals.py deleted file mode 100755 index 3c3be4738..000000000 --- a/archive/python/analyzeRecalQuals.py +++ /dev/null @@ -1,413 +0,0 @@ -from __future__ import with_statement -import farm_commands -import os.path -import sys -from optparse import OptionParser -import picard_utils -from gatkConfigParser import * -import re -from itertools import * -import math -import operator - -MAX_QUAL_SCORE = 40 - -def phredQScore( nMismatches, nBases ): - """Calculates a phred-scaled score for nMismatches in nBases""" - #print 'phredQScore', nMismatches, nBases - if nMismatches == 0: - return MAX_QUAL_SCORE - elif nBases == 0: - return 0 - else: - return min(-10 * math.log10(float(nMismatches) / nBases), MAX_QUAL_SCORE) - return r - - -def phredScore2ErrorProp(qual): - 
"""Converts a phred-scaled quality score to an error probability""" - #print 'phredScore2ErrorProp', qual - return math.pow(10.0, float(qual) / -10.0) - -def tryByInt(s): - """Try to cast something to an int, or return it as a string""" - try: - return int(s) - except: - return s - -expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',') -defaultValues = '0,0,0,**,0,0,0'.split(',') -class RecalData(dict): - """Basic recalibration data -- corresponds exactly to the Java version in GATK""" - def __init__(self): - self.parse(expectedHeader, defaultValues) - - def parse(self, header, data): - """Parse the comma-separated data line with corresponding header. Throws an error - if the header doesn't correspond to the expectedHeader""" - # rg,pos,Qrep,dn,NBases,MMismatches,Qemp - types = [str, tryByInt, int, str, int, int, int] - for head, expected, datum, type in zip(header, expectedHeader, data, types): - if head <> expected: - raise ("Unexpected header in rawData %s %s %s" % (head, expected, datum)) - #print 'Binding => ', head, type(datum) - self[head] = type(datum) - #print self - return self - - def set(self, header, values): - for head, val in zip(header, values): - self[head] = val - - def __getattr__(self, name): - return self[name] - - - # - # Trivial accessor functions - # - def readGroup(self): return self.rg - def dinuc(self): return self.dn - def qReported(self): return self.Qrep - def cycle(self): return self.pos - def getNBases(self): return self.nBases - def getNMismatches(self): return self.nMismatches - def nExpectedMismatches(self): return self.getNBases() * phredScore2ErrorProp(self.qReported()) - - - def qEmpirical(self): - #if OPTIONS.raw: - return self.Qemp - #else: - # r = phredQScore(self.getNMismatches() + 1, self.getNBases() + 1) - # #print 'Using yates corrected Q scores', self.getNMismatches(), self.getNBases(), self.getNMismatches() + 1, self.getNBases() + 1, self.Qemp, r, r - self.Qemp - # return r - - - def combine(self, 
moreData): - # grab useful info - sumErrors = self.nExpectedMismatches() - for datum in moreData: - self.nBases += datum.getNBases() - self.nMismatches += datum.getNMismatches() - sumErrors += datum.nExpectedMismatches() - self.updateQemp() - self.Qrep = phredQScore(sumErrors, self.getNBases()) - #print 'self.Qrep is now', self.Qrep - return self - - def updateQemp(self): - newQemp = phredQScore( self.getNMismatches(), self.getNBases() ) - #print 'Updating qEmp', self.Qemp, newQemp - self.Qemp = newQemp - return newQemp - - def __str__(self): - return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.getNBases(), self.getNMismatches()) - def __repr__(self): - return self.__str__() - -# def __init__(dinuc, Qrep, pos, nbases, nmismatches, qemp ): -# self.dinuc = dinuc -# self.Qrep = Qrep - -def rawDataStream(file): - """Yields successive lists containing the CSVs in the data file; excludes headers""" - header = None - for line in open(file): - if line.find("#") <> -1: continue - else: - data = line.strip().split(',') - if line.find("rg,") <> -1: - header = data - else: - yield RecalData().parse(header, data) - -def rawDataByReadGroup(rawDataFile): - """Yields a stream of the data in rawDataFile, grouped by readGroup""" - for readGroup, generator in groupby(rawDataStream(rawDataFile), key=RecalData.readGroup): - yield (readGroup, list(generator)) - -def combineRecalData(separateData): - return RecalData().combine(separateData) - -def groupRecalData(allData, key=None): - s = sorted(allData, key=key) - values = [ [key, combineRecalData(vals)] for key, vals in groupby(s, key=key) ] - return sorted( values, key=lambda x: x[0]) - -# -# let's actually analyze the data! 
-# -def analyzeReadGroup(readGroup, data, outputRoot): - print 'Read group => ', readGroup - print 'Number of elements => ', len(data) - - files = [] - if OPTIONS.toStdout: - basicQualScoreStats(readGroup, data, sys.stdout ) - qReportedVsqEmpirical(readGroup, data, sys.stdout ) - qDiffByCycle(readGroup, data, sys.stdout) - qDiffByDinuc(readGroup, data, sys.stdout) - else: - def outputFile(tail): - file = outputRoot + tail - files.append(file) - return file - - with open(outputFile(".basic_info.dat"), 'w') as output: - basicQualScoreStats(readGroup, data, output ) - with open(outputFile(".empirical_v_reported_quality.dat"), 'w') as output: - qReportedVsqEmpirical(readGroup, data, output ) - with open(outputFile(".quality_difference_v_cycle.dat"), 'w') as output: - qDiffByCycle(readGroup, data, output) - with open(outputFile(".quality_difference_v_dinucleotide.dat"), 'w') as output: - qDiffByDinuc(readGroup, data, output) - - print 'Files', files - return analyzeFiles(files) - -def countQsOfMinQuality(thres, data): - """Returns RecalData lists for each of the following: - All quality score bins with qRep > thres, and all quality scores with qRep and qRemp > thres""" - qDeclared = RecalData().combine(filter(lambda x: x.qReported() > thres, data)) - qDeclaredTrue = RecalData().combine(filter(lambda x: x.qReported() > thres and x.qEmpirical() > thres, data)) - #print qDeclared - return qDeclared, qDeclaredTrue - -def medianQreported(jaffe, allBases): - i, ignore = medianByCounts(map( RecalData.getNBases, jaffe )) - return jaffe[i].qReported() - -def medianByCounts(counts): - nTotal = lsum(counts) - sum = 0.0 - for i in range(len(counts)): - sum += counts[i] - if sum / nTotal > 0.5: - # The current datum contains the median - return i, counts[i] - -def modeQreported(jaffe, allBases): - ordered = sorted(jaffe, key=RecalData.getNBases, reverse=True ) - #print ordered - return ordered[0].qReported() - -def averageQreported(jaffe, allBases): - # the average reported quality 
score is already calculated and stored as qRep! - return allBases.qReported() - -def lsum(inlist): - return reduce(operator.__add__, inlist, 0) - -def lsamplestdev (inlist, counts, mean): - """ - Returns the variance of the values in the passed list using - N for the denominator (i.e., DESCRIBES the sample variance only). - - Usage: lsamplevar(inlist)""" - n = lsum(counts) - sum = 0.0 - for item, count in zip(inlist, counts): - diff = item - mean - inc = count * diff * diff - #print "%3d" % int(item), count, mean, diff, diff*diff, inc, sum - sum += inc - #print sum, n, sum / float(n-1), math.sqrt(sum / float(n-1)) - return math.sqrt(sum / float(n-1)) - -def rmse(reportedList, empiricalList, counts): - sum = 0.0 - for reported, empirical, count in zip(reportedList, empiricalList, counts): - diff = reported - empirical - inc = count * diff * diff - sum += inc - #print reported, empirical, sum, inc, count, diff - #print sum, math.sqrt(sum) - return math.sqrt(sum) - -def stdevQReported(jaffe, allBases): - mean = averageQreported(jaffe, allBases) - return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.getNBases, jaffe ), mean) - -def coeffOfVariationQreported(jaffe, allBases): - mean = averageQreported(jaffe, allBases) - stdev = stdevQReported(jaffe, allBases) - return stdev / mean - -def rmseJaffe(jaffe): - return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.getNBases, jaffe ) ) - -def basicQualScoreStats(readGroup, data, output ): - def o(s): - print >> output, s - # aggregate all the data into a single datum - rg, allBases = groupRecalData(data, key=RecalData.readGroup)[0] - #o(allBases) - o("read_group %s" % rg) - #o("number_of_cycles %d" % 0) - #o("maximum_reported_quality_score %d" % 0) - o("number_of_bases %d" % allBases.getNBases()) - o("number_of_mismatching_bases %d" % allBases.getNMismatches()) - o("lane_wide_Qreported %2.2f" % allBases.qReported()) - o("lane_wide_Qempirical %2.2f" % 
allBases.qEmpirical()) - o("lane_wide_Qempirical_minus_Qreported %2.2f" % (allBases.qEmpirical()-allBases.qReported())) - - jaffe = [datum for key, datum in qReportedVsqEmpiricalStream(readGroup, data)] - - o("median_Qreported %2.2f" % medianQreported(jaffe, allBases)) - o("mode_Qreported %2.2f" % modeQreported(jaffe, allBases)) - o("average_Qreported %2.2f" % averageQreported(jaffe, allBases)) - o("stdev_Qreported %2.2f" % stdevQReported(jaffe, allBases)) - o("coeff_of_variation_Qreported %2.2f" % coeffOfVariationQreported(jaffe, allBases)) - - o("RMSE_qReported_qEmpirical %2.2f" % rmseJaffe(jaffe)) - for thres in [20, 25, 30]: - qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe) - o("number_of_q%d+_bases %d" % (thres, qDeclared.getNBases())) - o("percent_of_q%d+_bases %2.2f" % (thres, 100 * qDeclared.getNBases() / float(allBases.getNBases()))) - o("number_of_q%d+_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.getNBases())) - o("percent_of_q%d+_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.getNBases() / float(allBases.getNBases()))) - -def qDiffByCycle(readGroup, allData, output): - #print '#### qDiffByCycle ####' - print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' - print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases' - for cycle, datum in groupRecalData(allData, key=RecalData.cycle): - datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle]) - diff = datum.qEmpirical() - datum.qReported() - print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases()) - -def qDiffByDinuc(readGroup, allData, output): - print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' - print 
>> output, 'Dinuc Qreported Qempirical Qempirical_Qreported nMismatches nBases' - for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc): - datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*']) - diff = datum.qEmpirical() - datum.qReported() - print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases()) - -def qReportedVsqEmpiricalStream(readGroup, data): - for key, datum in groupRecalData(data, key=RecalData.qReported): - datum.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', key, '*']) - yield key, datum - -def qReportedVsqEmpirical(readGroup, allData, output): - print >> output, 'Qreported Qempirical nMismatches nBases PercentBases' - rg, allBases = groupRecalData(allData, key=RecalData.readGroup)[0] - for key, datum in qReportedVsqEmpiricalStream(readGroup, allData): - #if datum.qReported() > 35: - # print datum - print >> output, "%2.2f %2.2f %12d %12d %.2f" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases(), 100.0*datum.getNBases() / float(allBases.getNBases())) - -def analyzeRawData(rawDataFile): - nReadGroups = 0 - for readGroup, data in rawDataByReadGroup(rawDataFile): - if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups: - nReadGroups += 1 - if nReadGroups > OPTIONS.maxReadGroups and OPTIONS.maxReadGroups <> -1: - break - else: - root, sourceFilename = os.path.split(rawDataFile) - if ( OPTIONS.outputDir ): root = OPTIONS.outputDir - outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' )) - analyzeReadGroup(readGroup, data, outputRoot) - -plottersByFile = { - "raw_data.csv$" : analyzeRawData, - "recal_data.csv$" : analyzeRawData, - "empirical_v_reported_quality" : 'PlotQEmpStated', - "quality_difference_v_dinucleotide" : 'PlotQDiffByDinuc', - "quality_difference_v_cycle" : 'PlotQDiffByCycle' } - -def getPlotterForFile(file): - for pat, analysis in 
plottersByFile.iteritems(): - if re.search(pat, file): - if type(analysis) == str: - return config.getOption('R', analysis, 'input_file') - else: - analysis(file) - return None - -def analyzeFiles(files): - #print 'analyzeFiles', files - Rscript = config.getOption('R', 'Rscript', 'input_file') - for file in files: - print 'Analyzing file', file - plotter = getPlotterForFile(file) - if plotter <> None and not OPTIONS.noplots: - cmd = ' '.join([Rscript, plotter, file]) - farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry) - -def main(): - global config, OPTIONS - usage = """usage: %prog -c config.cfg files*""" - - parser = OptionParser(usage=usage) - parser.add_option("-q", "--farm", dest="farmQueue", - type="string", default=None, - help="Farm queue to send processing jobs to") - parser.add_option("-d", "--dir", dest="outputDir", - type="string", default=None, - help="If provided, analysis output files will be written to this directory") - parser.add_option("-m", "--maxReadGroups", dest="maxReadGroups", - type="int", default=-1, - help="Maximum number of read groups to process. The default of -1 indicates that all read groups will be processed") - parser.add_option("-c", "--config", dest="configs", - action="append", type="string", default=[], - help="Configuration file") - parser.add_option("-s", "--stdout", dest="toStdout", - action='store_true', default=False, - help="If provided, writes output to standard output, not to files") - parser.add_option("", "--no_plots", dest="noplots", - action='store_true', default=False, - help="If provided, no plots will be generated") - parser.add_option("", "--dry", dest="dry", - action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - #parser.add_option("-r", "--raw", dest="raw", - # action='store_true', default=False, - # help="If provided, analyze data w.r.t. 
the raw empirical qulaity scores # mmismatches / # bases, as opposed to the Yates correction of +1 to each") - parser.add_option("-g", "--readGroup", dest="selectedReadGroups", - action="append", type="string", default=[], - help="If provided, only the provided read groups will be analyzed") - - (OPTIONS, args) = parser.parse_args() - #if len(args) != 3: - # parser.error("incorrect number of arguments") - - if len(OPTIONS.configs) == 0: - parser.error("Requires at least one configuration file be provided") - - config = gatkConfigParser(OPTIONS.configs) - - if OPTIONS.selectedReadGroups <> []: print 'Analyzing only the following read groups', OPTIONS.selectedReadGroups - analyzeFiles(args) - -import unittest -class TestanalzyeRecalQuals(unittest.TestCase): - def setUp(self): - self.numbers = [0, 1, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6, 6] - self.numbersItems = [0, 1, 2, 3, 4, 5, 6] - self.numbersCounts = [1, 1, 2, 1, 3, 3, 2] - self.numbers_sum = 47 - self.numbers_mean = 3.615385 - self.numbers_mode = 4 - self.numbers_median = 4 - self.numbers_stdev = 1.894662 - self.numbers_var = 3.589744 - self.numbers_cov = self.numbers_stdev / self.numbers_mean - - def testSum(self): - self.assertEquals(self.numbers_sum, lsum(self.numbers)) - self.assertEquals(0, lsum(self.numbers[0:0])) - self.assertEquals(1, lsum(self.numbers[0:2])) - self.assertEquals(3, lsum(self.numbers[0:3])) - - def teststdev(self): - self.assertAlmostEqual(self.numbers_stdev, lsamplestdev(self.numbersItems, self.numbersCounts, self.numbers_mean), 4) - -if __name__ == '__main__': - main() - #unittest.main() - diff --git a/archive/python/analyzeRecalQuals_1KG.py b/archive/python/analyzeRecalQuals_1KG.py deleted file mode 100755 index fb8647b92..000000000 --- a/archive/python/analyzeRecalQuals_1KG.py +++ /dev/null @@ -1,448 +0,0 @@ -from __future__ import with_statement -import os.path -import sys -from optparse import OptionParser -import re -from itertools import * -import math -import operator -import 
ConfigParser - -MAX_QUAL_SCORE = 40 - -defaultRequiredOptions = {} - -class gatkConfigParser(ConfigParser.SafeConfigParser): - GATK = 'DEFAULT' - - def __init__(self, configFiles): - ConfigParser.SafeConfigParser.__init__(self) - files = filter(None, configFiles) - print 'Reading configuration file(s):', files - self.read(files) - self.validateRequiredOptions() - - def validateRequiredOptions(self): - for key, value in defaultRequiredOptions.iteritems(): - self.validateOption(self.GATK, key, value) - - def validateOption(self, section, name, type = str): - v = self.getOption(section, name, type) - #print ' => Validated option', name, v - - def getGATKOption(self, name, type = str): - return self.getOption(self.GATK, name, type) - - def getGATKModeOption(self, name, mode, type = str): - return self.getOption(mode, name, type) - - def getOption(self, section, name, typeF = None): - if not self.has_option(section, name): - raise "Option %s not found in section %s" % (name, section) - else: - val = self.get(section, name) - if typeF == 'input_file' or typeF == 'output_file': - path = os.path.abspath(os.path.expanduser(val)) - if typeF == 'input_file': - if not os.path.exists(path): - raise "Input file does not exist", path - if not os.access(path, os.R_OK): - raise "Input file cannot be read", path - if typeF == 'output_file': - if not os.access(path, os.W_OK): - raise "Output file cannot be written", path - return path - elif type(typeF) == str: - return str(val) - elif typeF == None: - return val - else: - return typeF(val) - -def phredQScore( nMismatches, nBases ): - """Calculates a phred-scaled score for nMismatches in nBases""" - #print 'phredQScore', nMismatches, nBases - if nMismatches == 0: - return MAX_QUAL_SCORE - elif nBases == 0: - return 0 - else: - return min(-10 * math.log10(float(nMismatches) / nBases), MAX_QUAL_SCORE) - return r - - -def phredScore2ErrorProp(qual): - """Converts a phred-scaled quality score to an error probability""" - #print 
'phredScore2ErrorProp', qual - return math.pow(10.0, float(qual) / -10.0) - -def tryByInt(s): - """Try to cast something to an int, or return it as a string""" - try: - return int(s) - except: - return s - -expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',') -defaultValues = '0,0,0,**,0,0,0'.split(',') -class RecalData(dict): - """Basic recalibration data -- corresponds exactly to the Java version in GATK""" - def __init__(self): - self.parse(expectedHeader, defaultValues) - - def parse(self, header, data): - """Parse the comma-separated data line with corresponding header. Throws an error - if the header doesn't correspond to the expectedHeader""" - # rg,pos,Qrep,dn,NBases,MMismatches,Qemp - types = [str, tryByInt, int, str, int, int, int] - for head, expected, datum, type in zip(header, expectedHeader, data, types): - if head <> expected: - raise ("Unexpected header in rawData %s %s %s" % (head, expected, datum)) - #print 'Binding => ', head, type(datum) - self[head] = type(datum) - #print self - return self - - def set(self, header, values): - for head, val in zip(header, values): - self[head] = val - - def __getattr__(self, name): - return self[name] - - - # - # Trivial accessor functions - # - def readGroup(self): return self.rg - def dinuc(self): return self.dn - def qReported(self): return self.Qrep - def cycle(self): return self.pos - def getNBases(self): return self.nBases - def getNMismatches(self): return self.nMismatches - def nExpectedMismatches(self): return self.getNBases() * phredScore2ErrorProp(self.qReported()) - - - def qEmpirical(self): - #if OPTIONS.raw: - return self.Qemp - #else: - # r = phredQScore(self.getNMismatches() + 1, self.getNBases() + 1) - # #print 'Using yates corrected Q scores', self.getNMismatches(), self.getNBases(), self.getNMismatches() + 1, self.getNBases() + 1, self.Qemp, r, r - self.Qemp - # return r - - - def combine(self, moreData): - # grab useful info - sumErrors = self.nExpectedMismatches() - for 
datum in moreData: - self.nBases += datum.getNBases() - self.nMismatches += datum.getNMismatches() - sumErrors += datum.nExpectedMismatches() - self.updateQemp() - self.Qrep = phredQScore(sumErrors, self.getNBases()) - #print 'self.Qrep is now', self.Qrep - return self - - def updateQemp(self): - newQemp = phredQScore( self.getNMismatches(), self.getNBases() ) - #print 'Updating qEmp', self.Qemp, newQemp - self.Qemp = newQemp - return newQemp - - def __str__(self): - return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.getNBases(), self.getNMismatches()) - def __repr__(self): - return self.__str__() - -# def __init__(dinuc, Qrep, pos, nbases, nmismatches, qemp ): -# self.dinuc = dinuc -# self.Qrep = Qrep - -def rawDataStream(file): - """Yields successive lists containing the CSVs in the data file; excludes headers""" - header = None - for line in open(file): - if line.find("#") <> -1: continue - else: - data = line.strip().split(',') - if line.find("rg,") <> -1: - header = data - else: - yield RecalData().parse(header, data) - -def rawDataByReadGroup(rawDataFile): - """Yields a stream of the data in rawDataFile, grouped by readGroup""" - for readGroup, generator in groupby(rawDataStream(rawDataFile), key=RecalData.readGroup): - yield (readGroup, list(generator)) - -def combineRecalData(separateData): - return RecalData().combine(separateData) - -def groupRecalData(allData, key=None): - s = sorted(allData, key=key) - values = [ [key, combineRecalData(vals)] for key, vals in groupby(s, key=key) ] - return sorted( values, key=lambda x: x[0]) - -# -# let's actually analyze the data! 
-# -def analyzeReadGroup(readGroup, data, outputRoot): - print 'Read group => ', readGroup - print 'Number of elements => ', len(data) - - files = [] - if OPTIONS.toStdout: - basicQualScoreStats(readGroup, data, sys.stdout ) - qReportedVsqEmpirical(readGroup, data, sys.stdout ) - qDiffByCycle(readGroup, data, sys.stdout) - qDiffByDinuc(readGroup, data, sys.stdout) - else: - def outputFile(tail): - file = outputRoot + tail - files.append(file) - return file - - with open(outputFile(".basic_info.dat"), 'w') as output: - basicQualScoreStats(readGroup, data, output ) - with open(outputFile(".empirical_v_reported_quality.dat"), 'w') as output: - qReportedVsqEmpirical(readGroup, data, output ) - with open(outputFile(".quality_difference_v_cycle.dat"), 'w') as output: - qDiffByCycle(readGroup, data, output) - with open(outputFile(".quality_difference_v_dinucleotide.dat"), 'w') as output: - qDiffByDinuc(readGroup, data, output) - - print 'Files', files - return analyzeFiles(files) - -def countQsOfMinQuality(thres, data): - """Returns RecalData lists for each of the following: - All quality score bins with qRep > thres, and all quality scores with qRep and qRemp > thres""" - qDeclared = RecalData().combine(filter(lambda x: x.qReported() > thres, data)) - qDeclaredTrue = RecalData().combine(filter(lambda x: x.qReported() > thres and x.qEmpirical() > thres, data)) - #print qDeclared - return qDeclared, qDeclaredTrue - -def medianQreported(jaffe, allBases): - i, ignore = medianByCounts(map( RecalData.getNBases, jaffe )) - return jaffe[i].qReported() - -def medianByCounts(counts): - nTotal = lsum(counts) - sum = 0.0 - for i in range(len(counts)): - sum += counts[i] - if sum / nTotal > 0.5: - # The current datum contains the median - return i, counts[i] - -def modeQreported(jaffe, allBases): - ordered = sorted(jaffe, key=RecalData.getNBases, reverse=True ) - #print ordered - return ordered[0].qReported() - -def averageQreported(jaffe, allBases): - # the average reported quality 
score is already calculated and stored as qRep! - return allBases.qReported() - -def lsum(inlist): - return reduce(operator.__add__, inlist, 0) - -def lsamplestdev (inlist, counts, mean): - """ - Returns the variance of the values in the passed list using - N for the denominator (i.e., DESCRIBES the sample variance only). - - Usage: lsamplevar(inlist)""" - n = lsum(counts) - sum = 0.0 - for item, count in zip(inlist, counts): - diff = item - mean - inc = count * diff * diff - #print "%3d" % int(item), count, mean, diff, diff*diff, inc, sum - sum += inc - #print sum, n, sum / float(n-1), math.sqrt(sum / float(n-1)) - return math.sqrt(sum / float(n-1)) - -def rmse(reportedList, empiricalList, counts): - sum = 0.0 - for reported, empirical, count in zip(reportedList, empiricalList, counts): - diff = reported - empirical - inc = count * diff * diff - sum += inc - #print reported, empirical, sum, inc, count, diff - #print sum, math.sqrt(sum) - return math.sqrt(sum) - -def stdevQReported(jaffe, allBases): - mean = averageQreported(jaffe, allBases) - return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.getNBases, jaffe ), mean) - -def coeffOfVariationQreported(jaffe, allBases): - mean = averageQreported(jaffe, allBases) - stdev = stdevQReported(jaffe, allBases) - return stdev / mean - -def rmseJaffe(jaffe): - return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.getNBases, jaffe ) ) - -def basicQualScoreStats(readGroup, data, output ): - def o(s): - print >> output, s - # aggregate all the data into a single datum - rg, allBases = groupRecalData(data, key=RecalData.readGroup)[0] - #o(allBases) - o("read_group %s" % rg) - #o("number_of_cycles %d" % 0) - #o("maximum_reported_quality_score %d" % 0) - o("number_of_bases %d" % allBases.getNBases()) - o("number_of_mismatching_bases %d" % allBases.getNMismatches()) - o("lane_wide_Qreported %2.2f" % allBases.qReported()) - o("lane_wide_Qempirical %2.2f" % 
allBases.qEmpirical()) - o("lane_wide_Qempirical_minus_Qreported %2.2f" % (allBases.qEmpirical()-allBases.qReported())) - - jaffe = [datum for key, datum in qReportedVsqEmpiricalStream(readGroup, data)] - - o("median_Qreported %2.2f" % medianQreported(jaffe, allBases)) - o("mode_Qreported %2.2f" % modeQreported(jaffe, allBases)) - o("average_Qreported %2.2f" % averageQreported(jaffe, allBases)) - o("stdev_Qreported %2.2f" % stdevQReported(jaffe, allBases)) - o("coeff_of_variation_Qreported %2.2f" % coeffOfVariationQreported(jaffe, allBases)) - - o("RMSE_qReported_qEmpirical %2.2f" % rmseJaffe(jaffe)) - for thres in [20, 25, 30]: - qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe) - o("number_of_q%d+_bases %d" % (thres, qDeclared.getNBases())) - o("percent_of_q%d+_bases %2.2f" % (thres, 100 * qDeclared.getNBases() / float(allBases.getNBases()))) - o("number_of_q%d+_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.getNBases())) - o("percent_of_q%d+_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.getNBases() / float(allBases.getNBases()))) - -def qDiffByCycle(readGroup, allData, output): - #print '#### qDiffByCycle ####' - print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' - print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases' - for cycle, datum in groupRecalData(allData, key=RecalData.cycle): - datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle]) - diff = datum.qEmpirical() - datum.qReported() - print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases()) - -def qDiffByDinuc(readGroup, allData, output): - print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' - print 
>> output, 'Dinuc Qreported Qempirical Qempirical_Qreported nMismatches nBases' - for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc): - datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*']) - diff = datum.qEmpirical() - datum.qReported() - print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases()) - -def qReportedVsqEmpiricalStream(readGroup, data): - for key, datum in groupRecalData(data, key=RecalData.qReported): - datum.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', key, '*']) - yield key, datum - -def qReportedVsqEmpirical(readGroup, allData, output): - print >> output, 'Qreported Qempirical nMismatches nBases PercentBases' - rg, allBases = groupRecalData(allData, key=RecalData.readGroup)[0] - for key, datum in qReportedVsqEmpiricalStream(readGroup, allData): - #if datum.qReported() > 35: - # print datum - print >> output, "%2.2f %2.2f %12d %12d %.2f" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases(), 100.0*datum.getNBases() / float(allBases.getNBases())) - -def analyzeRawData(rawDataFile): - nReadGroups = 0 - for readGroup, data in rawDataByReadGroup(rawDataFile): - if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups: - nReadGroups += 1 - if nReadGroups > OPTIONS.maxReadGroups and OPTIONS.maxReadGroups <> -1: - break - else: - root, sourceFilename = os.path.split(rawDataFile) - if ( OPTIONS.outputDir ): root = OPTIONS.outputDir - outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' )) - analyzeReadGroup(readGroup, data, outputRoot) - -plottersByFile = { - ".csv$" : analyzeRawData, - "empirical_v_reported_quality" : 'PlotQEmpStated', - "quality_difference_v_dinucleotide" : 'PlotQDiffByDinuc', - "quality_difference_v_cycle" : 'PlotQDiffByCycle' } - -def getPlotterForFile(file): - for pat, analysis in plottersByFile.iteritems(): - if re.search(pat, file): - 
if type(analysis) == str: - return config.getOption('R', analysis, 'input_file') - else: - analysis(file) - return None - -def analyzeFiles(files): - #print 'analyzeFiles', files - Rscript = config.getOption('R', 'Rscript', 'input_file') - for file in files: - print 'Analyzing file', file - plotter = getPlotterForFile(file) - if plotter <> None and not OPTIONS.noplots: - cmd = ' '.join([Rscript, plotter, file]) - status = os.system(cmd) - -def main(): - global config, OPTIONS - usage = """usage: %prog -c config.cfg files*""" - - parser = OptionParser(usage=usage) - parser.add_option("-d", "--dir", dest="outputDir", - type="string", default=None, - help="If provided, analysis output files will be written to this directory") - parser.add_option("-m", "--maxReadGroups", dest="maxReadGroups", - type="int", default=-1, - help="Maximum number of read groups to process. The default of -1 indicates that all read groups will be processed") - parser.add_option("-c", "--config", dest="configs", - action="append", type="string", default=[], - help="Configuration file") - parser.add_option("-s", "--stdout", dest="toStdout", - action='store_true', default=False, - help="If provided, writes output to standard output, not to files") - parser.add_option("", "--no_plots", dest="noplots", - action='store_true', default=False, - help="If provided, no plots will be generated") - parser.add_option("-g", "--readGroup", dest="selectedReadGroups", - action="append", type="string", default=[], - help="If provided, only the provided read groups will be analyzed") - - (OPTIONS, args) = parser.parse_args() - - if len(OPTIONS.configs) == 0: - parser.error("Requires at least one configuration file be provided") - - config = gatkConfigParser(OPTIONS.configs) - - if OPTIONS.selectedReadGroups <> []: print 'Analyzing only the following read groups', OPTIONS.selectedReadGroups - analyzeFiles(args) - -import unittest -class TestanalzyeRecalQuals(unittest.TestCase): - def setUp(self): - self.numbers = 
[0, 1, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6, 6] - self.numbersItems = [0, 1, 2, 3, 4, 5, 6] - self.numbersCounts = [1, 1, 2, 1, 3, 3, 2] - self.numbers_sum = 47 - self.numbers_mean = 3.615385 - self.numbers_mode = 4 - self.numbers_median = 4 - self.numbers_stdev = 1.894662 - self.numbers_var = 3.589744 - self.numbers_cov = self.numbers_stdev / self.numbers_mean - - def testSum(self): - self.assertEquals(self.numbers_sum, lsum(self.numbers)) - self.assertEquals(0, lsum(self.numbers[0:0])) - self.assertEquals(1, lsum(self.numbers[0:2])) - self.assertEquals(3, lsum(self.numbers[0:3])) - - def teststdev(self): - self.assertAlmostEqual(self.numbers_stdev, lsamplestdev(self.numbersItems, self.numbersCounts, self.numbers_mean), 4) - -if __name__ == '__main__': - main() - #unittest.main() - diff --git a/build.xml b/build.xml index 34dcc05fb..4cdae6f1e 100644 --- a/build.xml +++ b/build.xml @@ -29,12 +29,16 @@ - + + + + - + + @@ -68,17 +72,16 @@ - - - + + - + - + @@ -88,7 +91,10 @@ - + + + + @@ -157,15 +163,15 @@ - + - - + + @@ -204,15 +210,8 @@ - - - - - - - - - + + @@ -251,10 +250,13 @@ - - - - + + + + + + + @@ -266,9 +268,17 @@ - - - + + + + + + + + + + + @@ -319,7 +329,8 @@ Building Scala... 
- + + @@ -342,7 +353,7 @@ - + @@ -354,7 +365,8 @@ - + + @@ -437,8 +449,6 @@ - - @@ -466,7 +476,7 @@ - + @@ -510,7 +520,7 @@ - + @@ -543,7 +553,7 @@ - + @@ -565,7 +575,7 @@ - + @@ -575,16 +585,10 @@ - + - - - - - - - - + + @@ -610,13 +614,15 @@ - + + + @@ -813,12 +823,18 @@ - + + + + + - + + diff --git a/c/SeparateQltout.cc b/c/SeparateQltout.cc deleted file mode 100644 index 7644c9603..000000000 --- a/c/SeparateQltout.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include "MainTools.h" -#include "Basevector.h" -#include "lookup/LookAlign.h" -#include "lookup/SerialQltout.h" - -unsigned int MatchingEnd(look_align &la, vecbasevector &candidates, vecbasevector &ref) { - //la.PrintParseable(cout); - - for (int i = 0; i < candidates.size(); i++) { - look_align newla = la; - - if (newla.rc1) { candidates[i].ReverseComplement(); } - newla.ResetFromAlign(newla.a, candidates[i], ref[la.target_id]); - - //newla.PrintParseable(cout, &candidates[i], &ref[newla.target_id]); - //cout << newla.Errors() << " " << la.Errors() << endl; - - if (newla.Errors() == la.Errors()) { - return i; - } - } - - //FatalErr("Query id " + ToString(la.query_id) + " had no matches."); - - return candidates.size() + 1; -} - -int main(int argc, char **argv) { - RunTime(); - - BeginCommandArguments; - CommandArgument_String(ALIGNS); - CommandArgument_String(FASTB_END_1); - CommandArgument_String(FASTB_END_2); - CommandArgument_String(REFERENCE); - - CommandArgument_String(ALIGNS_END_1_OUT); - CommandArgument_String(ALIGNS_END_2_OUT); - EndCommandArguments; - - vecbasevector ref(REFERENCE); - vecbasevector reads1(FASTB_END_1); - vecbasevector reads2(FASTB_END_2); - - ofstream aligns1stream(ALIGNS_END_1_OUT.c_str()); - ofstream aligns2stream(ALIGNS_END_2_OUT.c_str()); - - basevector bv; - - SerialQltout sqltout(ALIGNS); - look_align la; - while (sqltout.Next(la)) { - vecbasevector candidates(2); - candidates[0] = reads1[la.query_id]; - candidates[1] = reads2[la.query_id]; - - unsigned int matchingend = 
MatchingEnd(la, candidates, ref); - if (matchingend < 2) { - bv = (matchingend == 0) ? reads1[la.query_id] : reads2[la.query_id]; - - //la.PrintParseable(cout, &bv, &ref[la.target_id]); - la.PrintParseable(((matchingend == 0) ? aligns1stream : aligns2stream), &bv, &ref[la.target_id]); - } - } - - aligns1stream.close(); - aligns2stream.close(); - - return 0; -} diff --git a/c/bwa/Makefile b/c/bwa/Makefile deleted file mode 100644 index 6399a0e6d..000000000 --- a/c/bwa/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -CXX=g++ -CXXFLAGS=-g -Wall -O2 -m64 -fPIC - -.cpp.o: - $(CXX) -c $(CXXFLAGS) -I$(BWA_HOME) -I$(JAVA_INCLUDE) $< -o $@ - -all: init lib - -init: - @echo Please make sure the following platforms are set correctly on your machine. - @echo BWA_HOME=$(BWA_HOME) - @echo JAVA_INCLUDE=$(JAVA_INCLUDE) - @echo TARGET_LIB=$(TARGET_LIB) - @echo EXTRA_LIBS=$(EXTRA_LIBS) - @echo LIBTOOL_COMMAND=$(LIBTOOL_COMMAND) - -lib: org_broadinstitute_sting_alignment_bwa_c_BWACAligner.o bwa_gateway.o - $(LIBTOOL_COMMAND) $? 
-o $(TARGET_LIB) -L$(BWA_HOME) -lbwacore $(EXTRA_LIBS) - -clean: - rm *.o libbwa.* diff --git a/c/bwa/build_linux.sh b/c/bwa/build_linux.sh deleted file mode 100755 index c713f3963..000000000 --- a/c/bwa/build_linux.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa" -export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux" -export TARGET_LIB="libbwa.so" -export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread" -export LIBTOOL_COMMAND="g++ -shared -Wl,-soname,libbwa.so" -make diff --git a/c/bwa/build_mac.sh b/c/bwa/build_mac.sh deleted file mode 100644 index bfed900bb..000000000 --- a/c/bwa/build_mac.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -export BWA_HOME="/Users/mhanna/src/bwa" -export JAVA_INCLUDE="/System/Library/Frameworks/JavaVM.framework/Headers" -export TARGET_LIB="libbwa.dylib" -export EXTRA_LIBS="-lc -lz -lsupc++" -export LIBTOOL_COMMAND="libtool -dynamic" -make diff --git a/c/bwa/bwa_gateway.cpp b/c/bwa/bwa_gateway.cpp deleted file mode 100644 index 3f6850e37..000000000 --- a/c/bwa/bwa_gateway.cpp +++ /dev/null @@ -1,268 +0,0 @@ -#include -#include - -#include "bwase.h" -#include "bwa_gateway.h" - -BWA::BWA(const char* ann_filename, - const char* amb_filename, - const char* pac_filename, - const char* forward_bwt_filename, - const char* forward_sa_filename, - const char* reverse_bwt_filename, - const char* reverse_sa_filename) -{ - // Load the bns (?) 
and reference - bns = bns_restore_core(ann_filename,amb_filename,pac_filename); - reference = new ubyte_t[bns->l_pac/4+1]; - rewind(bns->fp_pac); - fread(reference, 1, bns->l_pac/4+1, bns->fp_pac); - fclose(bns->fp_pac); - bns->fp_pac = NULL; - - // Load the BWTs (both directions) and suffix arrays (both directions) - bwts[0] = bwt_restore_bwt(forward_bwt_filename); - bwt_restore_sa(forward_sa_filename, bwts[0]); - bwts[1] = bwt_restore_bwt(reverse_bwt_filename); - bwt_restore_sa(reverse_sa_filename, bwts[1]); - load_default_options(); - - // initialize the bwase subsystem - bwase_initialize(); -} - -BWA::~BWA() { - delete[] reference; - bns_destroy(bns); - bwt_destroy(bwts[0]); - bwt_destroy(bwts[1]); -} - -void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count) -{ - bwa_seq_t* sequence = create_sequence(bases, read_length); - - // Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln). - // This method will destroy the contents of seq and rseq. - bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); - - paths = new bwt_aln1_t[sequence->n_aln]; - memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t)); - num_paths = sequence->n_aln; - - // Call aln2seq to initialize the type of match present. - bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); - best_path_count = sequence->c1; - second_best_path_count = sequence->c2; - - bwa_free_read_seq(1,sequence); -} - -Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) { - bwa_seq_t* sequence = create_sequence(bases,read_length); - - // Calculate paths. - bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); - - // Check for no alignments found and return null. - if(sequence->n_aln == 0) { - bwa_free_read_seq(1,sequence); - return NULL; - } - - // bwa_cal_sa_reg_gap destroys the bases / read length. Copy them back in. 
- copy_bases_into_sequence(sequence,bases,read_length); - - // Pick best alignment and propagate its information into the sequence. - bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); - - // Generate the best alignment from the sequence. - Alignment* alignment = new Alignment; - *alignment = generate_final_alignment_from_sequence(sequence); - - bwa_free_read_seq(1,sequence); - - return alignment; -} - -void BWA::generate_alignments_from_paths(const char* bases, - const unsigned read_length, - bwt_aln1_t* paths, - const unsigned num_paths, - const unsigned best_count, - const unsigned second_best_count, - Alignment*& alignments, - unsigned& num_alignments) -{ - bwa_seq_t* sequence = create_sequence(bases,read_length); - - sequence->aln = paths; - sequence->n_aln = num_paths; - - // (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself. - bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); - - // But overwrite key parts of the sequence in case the user passed back only a smaller subset - // of the paths. - sequence->c1 = best_count; - sequence->c2 = second_best_count; - sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; - - num_alignments = 0; - for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++) - num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1; - - alignments = new Alignment[num_alignments]; - unsigned alignment_idx = 0; - - for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { - // Stub in a 'working' path, so that only the desired alignment is local-aligned. - const bwt_aln1_t* path = paths + path_idx; - bwt_aln1_t working_path = *path; - - // Loop through all alignments, aligning each one individually. 
- for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) { - working_path.k = working_path.l = sa_idx; - sequence->aln = &working_path; - sequence->n_aln = 1; - - sequence->sa = sa_idx; - sequence->strand = path->a; - sequence->score = path->score; - - // Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse. - // TODO: Fix the interface to bwa_refine_gapped so its easier to work with. - if(alignment_idx > 0) - seq_reverse(sequence->len, sequence->seq, 0); - - // Copy the local alignment data into the alignment object. - *(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence); - - alignment_idx++; - } - } - - sequence->aln = NULL; - sequence->n_aln = 0; - - bwa_free_read_seq(1,sequence); -} - -Alignment BWA::generate_final_alignment_from_sequence(bwa_seq_t* sequence) { - // Calculate the local coordinate and local alignment. - bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr); - bwa_refine_gapped(bns, 1, sequence, reference, NULL); - - // Copy the local alignment data into the alignment object. - Alignment alignment; - - // Populate basic path info - alignment.edit_distance = sequence->nm; - alignment.num_mismatches = sequence->n_mm; - alignment.num_gap_opens = sequence->n_gapo; - alignment.num_gap_extensions = sequence->n_gape; - alignment.num_best = sequence->c1; - alignment.num_second_best = sequence->c2; - - // Final alignment position. - alignment.type = sequence->type; - bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig); - alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1; - alignment.negative_strand = sequence->strand; - alignment.mapping_quality = sequence->mapQ; - - // Cigar step. 
- alignment.cigar = NULL; - if(sequence->cigar) { - alignment.cigar = new uint16_t[sequence->n_cigar]; - memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t)); - } - alignment.n_cigar = sequence->n_cigar; - - // MD tag with a better breakdown of differences in the cigar - alignment.md = strdup(sequence->md); - delete[] sequence->md; - sequence->md = NULL; - - return alignment; -} - -void BWA::load_default_options() -{ - options.s_mm = 3; - options.s_gapo = 11; - options.s_gape = 4; - options.mode = 3; - options.indel_end_skip = 5; - options.max_del_occ = 10; - options.max_entries = 2000000; - options.fnr = 0.04; - options.max_diff = -1; - options.max_gapo = 1; - options.max_gape = 6; - options.max_seed_diff = 2; - options.seed_len = 2147483647; - options.n_threads = 1; - options.max_top2 = 30; - options.trim_qual = 0; -} - -void BWA::set_max_edit_distance(float edit_distance) { - if(edit_distance > 0 && edit_distance < 1) { - options.fnr = edit_distance; - options.max_diff = -1; - } - else { - options.fnr = -1.0; - options.max_diff = (int)edit_distance; - } -} - -void BWA::set_max_gap_opens(int max_gap_opens) { options.max_gapo = max_gap_opens; } -void BWA::set_max_gap_extensions(int max_gap_extensions) { options.max_gape = max_gap_extensions; } -void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_skip = indel_range; } -void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; } -void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; } -void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; } - -/** - * Create a sequence with a set of reasonable initial defaults. - * Will leave seq and rseq empty. 
- */ -bwa_seq_t* BWA::create_sequence(const char* bases, const unsigned read_length) -{ - bwa_seq_t* sequence = new bwa_seq_t; - - sequence->tid = -1; - - sequence->name = 0; - - copy_bases_into_sequence(sequence, bases, read_length); - - sequence->qual = 0; - sequence->aln = 0; - sequence->md = 0; - - sequence->cigar = NULL; - sequence->n_cigar = 0; - - sequence->multi = NULL; - sequence->n_multi = 0; - - return sequence; -} - -void BWA::copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length) -{ - // seq, rseq will ultimately be freed by bwa_cal_sa_reg_gap - sequence->seq = new ubyte_t[read_length]; - sequence->rseq = new ubyte_t[read_length]; - for(unsigned i = 0; i < read_length; i++) sequence->seq[i] = nst_nt4_table[(unsigned)bases[i]]; - memcpy(sequence->rseq,sequence->seq,read_length); - - // BWA expects the read bases to arrive reversed. - seq_reverse(read_length,sequence->seq,0); - seq_reverse(read_length,sequence->rseq,1); - - sequence->full_len = sequence->len = read_length; -} diff --git a/c/bwa/bwa_gateway.h b/c/bwa/bwa_gateway.h deleted file mode 100644 index 0ef0a129b..000000000 --- a/c/bwa/bwa_gateway.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef BWA_GATEWAY -#define BWA_GATEWAY - -#include - -#include "bntseq.h" -#include "bwt.h" -#include "bwtaln.h" - -class Alignment { - public: - uint32_t type; - int contig; - bwtint_t pos; - bool negative_strand; - uint32_t mapping_quality; - - uint16_t *cigar; - int n_cigar; - - uint8_t num_mismatches; - uint8_t num_gap_opens; - uint8_t num_gap_extensions; - uint16_t edit_distance; - - uint32_t num_best; - uint32_t num_second_best; - - char* md; -}; - -class BWA { - private: - bntseq_t *bns; - ubyte_t* reference; - bwt_t* bwts[2]; - gap_opt_t options; - - void load_default_options(); - bwa_seq_t* create_sequence(const char* bases, const unsigned read_length); - void copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length); - Alignment 
generate_final_alignment_from_sequence(bwa_seq_t* sequence); - - public: - BWA(const char* ann_filename, - const char* amb_filename, - const char* pac_filename, - const char* forward_bwt_filename, - const char* forward_sa_filename, - const char* reverse_bwt_filename, - const char* reverse_sa_filename); - ~BWA(); - - // Parameterize the aligner. - void set_max_edit_distance(float edit_distance); - void set_max_gap_opens(int max_gap_opens); - void set_max_gap_extensions(int max_gap_extensions); - void set_disallow_indel_within_range(int indel_range); - void set_mismatch_penalty(int penalty); - void set_gap_open_penalty(int penalty); - void set_gap_extension_penalty(int penalty); - - // Perform the alignment - Alignment* generate_single_alignment(const char* bases, - const unsigned read_length); - void find_paths(const char* bases, - const unsigned read_length, - bwt_aln1_t*& paths, - unsigned& num_paths, - unsigned& best_path_count, - unsigned& second_best_path_count); - void generate_alignments_from_paths(const char* bases, - const unsigned read_length, - bwt_aln1_t* paths, - const unsigned num_paths, - const unsigned best_count, - const unsigned second_best_count, - Alignment*& alignments, - unsigned& num_alignments); -}; - -#endif // BWA_GATEWAY diff --git a/c/bwa/libbwa.so.1 b/c/bwa/libbwa.so.1 deleted file mode 100755 index bfa3c2847..000000000 Binary files a/c/bwa/libbwa.so.1 and /dev/null differ diff --git a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp b/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp deleted file mode 100644 index 1ccbef0d4..000000000 --- a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp +++ /dev/null @@ -1,437 +0,0 @@ -#include -#include -#include - -#include "bntseq.h" -#include "bwt.h" -#include "bwtaln.h" -#include "bwa_gateway.h" -#include "org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h" - -typedef void (BWA::*int_setter)(int value); -typedef void (BWA::*float_setter)(float value); 
- -static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment); -static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name); -static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter); -static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter); -static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message); - -JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create(JNIEnv* env, jobject instance, jobject bwtFiles, jobject configuration) -{ - jstring java_ann = get_configuration_file(env,bwtFiles,"annFile"); - if(java_ann == NULL) return 0L; - jstring java_amb = get_configuration_file(env,bwtFiles,"ambFile"); - if(java_amb == NULL) return 0L; - jstring java_pac = get_configuration_file(env,bwtFiles,"pacFile"); - if(java_pac == NULL) return 0L; - jstring java_forward_bwt = get_configuration_file(env,bwtFiles,"forwardBWTFile"); - if(java_forward_bwt == NULL) return 0L; - jstring java_forward_sa = get_configuration_file(env,bwtFiles,"forwardSAFile"); - if(java_forward_sa == NULL) return 0L; - jstring java_reverse_bwt = get_configuration_file(env,bwtFiles,"reverseBWTFile"); - if(java_reverse_bwt == NULL) return 0L; - jstring java_reverse_sa = get_configuration_file(env,bwtFiles,"reverseSAFile"); - if(java_reverse_sa == NULL) return 0L; - - const char* ann_filename = env->GetStringUTFChars(java_ann,JNI_FALSE); - if(env->ExceptionCheck()) return 0L; - const char* amb_filename = env->GetStringUTFChars(java_amb,JNI_FALSE); - if(env->ExceptionCheck()) return 0L; - const char* pac_filename = env->GetStringUTFChars(java_pac,JNI_FALSE); - if(env->ExceptionCheck()) return 0L; - const char* forward_bwt_filename = 
env->GetStringUTFChars(java_forward_bwt,JNI_FALSE); - if(env->ExceptionCheck()) return 0L; - const char* forward_sa_filename = env->GetStringUTFChars(java_forward_sa,JNI_FALSE); - if(env->ExceptionCheck()) return 0L; - const char* reverse_bwt_filename = env->GetStringUTFChars(java_reverse_bwt,JNI_FALSE); - if(env->ExceptionCheck()) return 0L; - const char* reverse_sa_filename = env->GetStringUTFChars(java_reverse_sa,JNI_FALSE); - if(env->ExceptionCheck()) return 0L; - - BWA* bwa = new BWA(ann_filename, - amb_filename, - pac_filename, - forward_bwt_filename, - forward_sa_filename, - reverse_bwt_filename, - reverse_sa_filename); - - Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(env,instance,(jlong)bwa,configuration); - if(env->ExceptionCheck()) return 0L; - - env->ReleaseStringUTFChars(java_ann,ann_filename); - if(env->ExceptionCheck()) return 0L; - env->ReleaseStringUTFChars(java_amb,amb_filename); - if(env->ExceptionCheck()) return 0L; - env->ReleaseStringUTFChars(java_pac,pac_filename); - if(env->ExceptionCheck()) return 0L; - env->ReleaseStringUTFChars(java_forward_bwt,forward_bwt_filename); - if(env->ExceptionCheck()) return 0L; - env->ReleaseStringUTFChars(java_forward_sa,forward_sa_filename); - if(env->ExceptionCheck()) return 0L; - env->ReleaseStringUTFChars(java_reverse_bwt,reverse_bwt_filename); - if(env->ExceptionCheck()) return 0L; - env->ReleaseStringUTFChars(java_reverse_sa,reverse_sa_filename); - if(env->ExceptionCheck()) return 0L; - - return (jlong)bwa; -} - -JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy(JNIEnv* env, jobject instance, jlong java_bwa) -{ - BWA* bwa = (BWA*)java_bwa; - delete bwa; -} - -JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(JNIEnv *env, jobject instance, jlong java_bwa, jobject configuration) { - BWA* bwa = (BWA*)java_bwa; - set_float_configuration_param(env, configuration, "maximumEditDistance", 
bwa, &BWA::set_max_edit_distance); - if(env->ExceptionCheck()) return; - set_int_configuration_param(env, configuration, "maximumGapOpens", bwa, &BWA::set_max_gap_opens); - if(env->ExceptionCheck()) return; - set_int_configuration_param(env, configuration, "maximumGapExtensions", bwa, &BWA::set_max_gap_extensions); - if(env->ExceptionCheck()) return; - set_int_configuration_param(env, configuration, "disallowIndelWithinRange", bwa, &BWA::set_disallow_indel_within_range); - if(env->ExceptionCheck()) return; - set_int_configuration_param(env, configuration, "mismatchPenalty", bwa, &BWA::set_mismatch_penalty); - if(env->ExceptionCheck()) return; - set_int_configuration_param(env, configuration, "gapOpenPenalty", bwa, &BWA::set_gap_open_penalty); - if(env->ExceptionCheck()) return; - set_int_configuration_param(env, configuration, "gapExtensionPenalty", bwa, &BWA::set_gap_extension_penalty); - if(env->ExceptionCheck()) return; -} - -JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases) -{ - BWA* bwa = (BWA*)java_bwa; - - const jsize read_length = env->GetArrayLength(java_bases); - if(env->ExceptionCheck()) return NULL; - - jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE); - if(read_bases == NULL) return NULL; - - bwt_aln1_t* paths = NULL; - unsigned num_paths = 0; - - unsigned best_path_count, second_best_path_count; - bwa->find_paths((const char*)read_bases,read_length,paths,num_paths,best_path_count,second_best_path_count); - - jobjectArray java_paths = env->NewObjectArray(num_paths, env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"), NULL); - if(java_paths == NULL) return NULL; - - for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { - bwt_aln1_t& path = *(paths + path_idx); - - jclass java_path_class = env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"); - if(java_path_class == NULL) 
return NULL; - - jmethodID java_path_constructor = env->GetMethodID(java_path_class, "", "(IIIZJJIII)V"); - if(java_path_constructor == NULL) return NULL; - - // Note that k/l are being cast to long. Bad things will happen if JNI assumes that they're ints. - jobject java_path = env->NewObject(java_path_class, - java_path_constructor, - path.n_mm, - path.n_gapo, - path.n_gape, - path.a, - (jlong)path.k, - (jlong)path.l, - path.score, - best_path_count, - second_best_path_count); - if(java_path == NULL) return NULL; - - env->SetObjectArrayElement(java_paths,path_idx,java_path); - if(env->ExceptionCheck()) return NULL; - - env->DeleteLocalRef(java_path_class); - if(env->ExceptionCheck()) return NULL; - } - - delete[] paths; - - env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE); - - return env->ExceptionCheck() ? NULL : java_paths; -} - -JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases, jobjectArray java_paths) -{ - BWA* bwa = (BWA*)java_bwa; - - const jsize read_length = env->GetArrayLength(java_bases); - if(env->ExceptionCheck()) return NULL; - - jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE); - if(read_bases == NULL) return NULL; - - const jsize num_paths = env->GetArrayLength(java_paths); - bwt_aln1_t* paths = new bwt_aln1_t[num_paths]; - unsigned best_count = 0, second_best_count = 0; - - for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { - jobject java_path = env->GetObjectArrayElement(java_paths,path_idx); - jclass java_path_class = env->GetObjectClass(java_path); - if(java_path_class == NULL) return NULL; - - bwt_aln1_t& path = *(paths + path_idx); - - jfieldID mismatches_field = env->GetFieldID(java_path_class, "numMismatches", "I"); - if(mismatches_field == NULL) return NULL; - path.n_mm = env->GetIntField(java_path,mismatches_field); - if(env->ExceptionCheck()) return 
NULL; - - jfieldID gap_opens_field = env->GetFieldID(java_path_class, "numGapOpens", "I"); - if(gap_opens_field == NULL) return NULL; - path.n_gapo = env->GetIntField(java_path,gap_opens_field); - if(env->ExceptionCheck()) return NULL; - - jfieldID gap_extensions_field = env->GetFieldID(java_path_class, "numGapExtensions", "I"); - if(gap_extensions_field == NULL) return NULL; - path.n_gape = env->GetIntField(java_path,gap_extensions_field); - if(env->ExceptionCheck()) return NULL; - - jfieldID negative_strand_field = env->GetFieldID(java_path_class, "negativeStrand", "Z"); - if(negative_strand_field == NULL) return NULL; - path.a = env->GetBooleanField(java_path,negative_strand_field); - if(env->ExceptionCheck()) return NULL; - - jfieldID k_field = env->GetFieldID(java_path_class, "k", "J"); - if(k_field == NULL) return NULL; - path.k = env->GetLongField(java_path,k_field); - if(env->ExceptionCheck()) return NULL; - - jfieldID l_field = env->GetFieldID(java_path_class, "l", "J"); - if(l_field == NULL) return NULL; - path.l = env->GetLongField(java_path,l_field); - if(env->ExceptionCheck()) return NULL; - - jfieldID score_field = env->GetFieldID(java_path_class, "score", "I"); - if(score_field == NULL) return NULL; - path.score = env->GetIntField(java_path,score_field); - if(env->ExceptionCheck()) return NULL; - - jfieldID best_count_field = env->GetFieldID(java_path_class, "bestCount", "I"); - if(best_count_field == NULL) return NULL; - best_count = env->GetIntField(java_path,best_count_field); - if(env->ExceptionCheck()) return NULL; - - jfieldID second_best_count_field = env->GetFieldID(java_path_class, "secondBestCount", "I"); - if(second_best_count_field == NULL) return NULL; - second_best_count = env->GetIntField(java_path,second_best_count_field); - if(env->ExceptionCheck()) return NULL; - } - - Alignment* alignments = NULL; - unsigned num_alignments = 0; - bwa->generate_alignments_from_paths((const 
char*)read_bases,read_length,paths,num_paths,best_count,second_best_count,alignments,num_alignments); - - jobjectArray java_alignments = env->NewObjectArray(num_alignments, env->FindClass("org/broadinstitute/sting/alignment/Alignment"), NULL); - if(java_alignments == NULL) return NULL; - - for(unsigned alignment_idx = 0; alignment_idx < (unsigned)num_alignments; alignment_idx++) { - Alignment& alignment = *(alignments + alignment_idx); - jobject java_alignment = convert_to_java_alignment(env,read_bases,read_length,alignment); - if(java_alignment == NULL) return NULL; - env->SetObjectArrayElement(java_alignments,alignment_idx,java_alignment); - if(env->ExceptionCheck()) return NULL; - } - - delete[] alignments; - delete[] paths; - - env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE); - - return env->ExceptionCheck() ? NULL : java_alignments; -} - -JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases) { - BWA* bwa = (BWA*)java_bwa; - - const jsize read_length = env->GetArrayLength(java_bases); - if(env->ExceptionCheck()) return NULL; - - jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE); - if(read_bases == NULL) return NULL; - - Alignment* best_alignment = bwa->generate_single_alignment((const char*)read_bases,read_length); - jobject java_best_alignment = (best_alignment != NULL) ? 
convert_to_java_alignment(env,read_bases,read_length,*best_alignment) : NULL; - delete best_alignment; - - env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE); - - return java_best_alignment; -} - -static jobject convert_to_java_alignment(JNIEnv *env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment) { - unsigned cigar_length; - if(alignment.type == BWA_TYPE_NO_MATCH) cigar_length = 0; - else if(!alignment.cigar) cigar_length = 1; - else cigar_length = alignment.n_cigar; - - jcharArray java_cigar_operators = env->NewCharArray(cigar_length); - if(java_cigar_operators == NULL) return NULL; - jintArray java_cigar_lengths = env->NewIntArray(cigar_length); - if(java_cigar_lengths == NULL) return NULL; - - if(alignment.cigar) { - for(unsigned cigar_idx = 0; cigar_idx < (unsigned)alignment.n_cigar; ++cigar_idx) { - jchar cigar_operator = "MIDS"[alignment.cigar[cigar_idx]>>14]; - jint cigar_length = alignment.cigar[cigar_idx]&0x3fff; - - env->SetCharArrayRegion(java_cigar_operators,cigar_idx,1,&cigar_operator); - if(env->ExceptionCheck()) return NULL; - env->SetIntArrayRegion(java_cigar_lengths,cigar_idx,1,&cigar_length); - if(env->ExceptionCheck()) return NULL; - } - } - else { - if(alignment.type != BWA_TYPE_NO_MATCH) { - jchar cigar_operator = 'M'; - env->SetCharArrayRegion(java_cigar_operators,0,1,&cigar_operator); - if(env->ExceptionCheck()) return NULL; - env->SetIntArrayRegion(java_cigar_lengths,0,1,&read_length); - if(env->ExceptionCheck()) return NULL; - } - } - delete[] alignment.cigar; - - jclass java_alignment_class = env->FindClass("org/broadinstitute/sting/alignment/Alignment"); - if(java_alignment_class == NULL) return NULL; - - jmethodID java_alignment_constructor = env->GetMethodID(java_alignment_class, "", "(IIZI[C[IILjava/lang/String;IIIII)V"); - if(java_alignment_constructor == NULL) return NULL; - - jstring java_md = env->NewStringUTF(alignment.md); - if(java_md == NULL) return NULL; - delete[] alignment.md; - - 
jobject java_alignment = env->NewObject(java_alignment_class, - java_alignment_constructor, - alignment.contig, - alignment.pos, - alignment.negative_strand, - alignment.mapping_quality, - java_cigar_operators, - java_cigar_lengths, - alignment.edit_distance, - java_md, - alignment.num_mismatches, - alignment.num_gap_opens, - alignment.num_gap_extensions, - alignment.num_best, - alignment.num_second_best); - if(java_alignment == NULL) return NULL; - - env->DeleteLocalRef(java_alignment_class); - if(env->ExceptionCheck()) return NULL; - - return java_alignment; -} - -static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name) { - jclass configuration_class = env->GetObjectClass(configuration); - if(configuration_class == NULL) return NULL; - - jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/io/File;"); - if(configuration_field == NULL) return NULL; - - jobject configuration_file = (jobject)env->GetObjectField(configuration,configuration_field); - - jclass file_class = env->FindClass("java/io/File"); - if(file_class == NULL) return NULL; - - jmethodID path_extractor = env->GetMethodID(file_class,"getAbsolutePath", "()Ljava/lang/String;"); - if(path_extractor == NULL) return NULL; - - jstring path = (jstring)env->CallObjectMethod(configuration_file,path_extractor); - if(path == NULL) return NULL; - - env->DeleteLocalRef(configuration_class); - env->DeleteLocalRef(file_class); - env->DeleteLocalRef(configuration_file); - - return path; -} - -static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) { - jclass configuration_class = env->GetObjectClass(configuration); - if(configuration_class == NULL) return; - - jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Integer;"); - if(configuration_field == NULL) return; - - jobject boxed_value = 
env->GetObjectField(configuration,configuration_field); - if(env->ExceptionCheck()) return; - - if(boxed_value != NULL) { - jclass int_box_class = env->FindClass("java/lang/Integer"); - if(int_box_class == NULL) return; - - jmethodID int_extractor = env->GetMethodID(int_box_class,"intValue", "()I"); - if(int_extractor == NULL) return; - - jint value = env->CallIntMethod(boxed_value,int_extractor); - if(env->ExceptionCheck()) return; - - if(value < 0) - { - throw_config_value_exception(env,field_name,"cannot be set to a negative value"); - return; - } - - (bwa->*setter)(value); - - env->DeleteLocalRef(int_box_class); - } - - env->DeleteLocalRef(boxed_value); - env->DeleteLocalRef(configuration_class); -} - -static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter) -{ - jclass configuration_class = env->GetObjectClass(configuration); - if(configuration_class == NULL) return; - - jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Float;"); - if(configuration_field == NULL) return; - - jobject boxed_value = env->GetObjectField(configuration,configuration_field); - if(boxed_value != NULL) { - jclass float_box_class = env->FindClass("java/lang/Float"); - if(float_box_class == NULL) return; - - jmethodID float_extractor = env->GetMethodID(float_box_class,"floatValue", "()F"); - if(float_extractor == NULL) return; - - jfloat value = env->CallFloatMethod(boxed_value,float_extractor); - if(env->ExceptionCheck()) return; - - if(value < 0) - { - throw_config_value_exception(env,field_name,"cannot be set to a negative value"); - return; - } - - (bwa->*setter)(value); - - env->DeleteLocalRef(float_box_class); - } - - env->DeleteLocalRef(boxed_value); - env->DeleteLocalRef(configuration_class); -} - -static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message) -{ - char* buffer = new char[strlen(field_name)+1+strlen(message)+1]; 
- sprintf(buffer,"%s %s",field_name,message); - jclass sting_exception_class = env->FindClass("org/broadinstitute/sting/utils/StingException"); - if(sting_exception_class == NULL) return; - env->ThrowNew(sting_exception_class, buffer); - delete[] buffer; -} diff --git a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h b/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h deleted file mode 100644 index 0c44e430a..000000000 --- a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h +++ /dev/null @@ -1,61 +0,0 @@ -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class org_broadinstitute_sting_alignment_bwa_c_BWACAligner */ - -#ifndef _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner -#define _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner - * Method: create - * Signature: (Lorg/broadinstitute/sting/alignment/bwa/BWTFiles;Lorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)J - */ -JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create - (JNIEnv *, jobject, jobject, jobject); - -/* - * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner - * Method: updateConfiguration - * Signature: (JLorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)V - */ -JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration - (JNIEnv *, jobject, jlong, jobject); - -/* - * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy - (JNIEnv *, jobject, jlong); - -/* - * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner - * Method: getPaths - * Signature: (J[B)[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath; - */ -JNIEXPORT jobjectArray JNICALL 
Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths - (JNIEnv *, jobject, jlong, jbyteArray); - -/* - * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner - * Method: convertPathsToAlignments - * Signature: (J[B[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;)[Lorg/broadinstitute/sting/alignment/Alignment; - */ -JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments - (JNIEnv *, jobject, jlong, jbyteArray, jobjectArray); - -/* - * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner - * Method: getBestAlignment - * Signature: (J[B)Lorg/broadinstitute/sting/alignment/Alignment; - */ -JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment - (JNIEnv *, jobject, jlong, jbyteArray); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/c/libenvironhack/Makefile b/c/libenvironhack/Makefile deleted file mode 100644 index 302ff8e31..000000000 --- a/c/libenvironhack/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -CC=gcc -CCFLAGS=-Wall -dynamiclib -arch i386 -arch x86_64 - -libenvironhack.dylib: libenvironhack.c - $(CC) $(CCFLAGS) -init _init_environ $< -o $@ - -all: libenvironhack.dylib - -clean: - rm -f libenvironhack.dylib diff --git a/c/libenvironhack/libenvironhack.c b/c/libenvironhack/libenvironhack.c deleted file mode 100644 index 8b2a2640e..000000000 --- a/c/libenvironhack/libenvironhack.c +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * 
The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* -LSF 7.0.6 on the mac is missing the unsatisfied exported symbol for environ which was removed on MacOS X 10.5+. -nm $LSF_LIBDIR/liblsf.dylib | grep environ -See "man environ" for more info, along with http://lists.apple.com/archives/java-dev/2007/Dec/msg00096.html -*/ - -#include - -char **environ = (char **)0; - -void init_environ(void) { - environ = (*_NSGetEnviron()); -} diff --git a/c/libenvironhack/libenvironhack.dylib b/c/libenvironhack/libenvironhack.dylib deleted file mode 100755 index a45e038b4..000000000 Binary files a/c/libenvironhack/libenvironhack.dylib and /dev/null differ diff --git a/java/src/org/broadinstitute/sting/alignment/Aligner.java b/java/src/org/broadinstitute/sting/alignment/Aligner.java deleted file mode 100644 index 9ffe5f857..000000000 --- a/java/src/org/broadinstitute/sting/alignment/Aligner.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.broadinstitute.sting.alignment; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileHeader; - -import java.util.List; -import java.util.Iterator; - -/** - * Create perfect alignments from the read to the genome represented by the given BWT / suffix array. - * - * @author mhanna - * @version 0.1 - */ -public interface Aligner { - /** - * Close this instance of the BWA pointer and delete its resources. 
- */ - public void close(); - - /** - * Allow the aligner to choose one alignment randomly from the pile of best alignments. - * @param bases Bases to align. - * @return An align - */ - public Alignment getBestAlignment(final byte[] bases); - - /** - * Align the read to the reference. - * @param read Read to align. - * @param header Optional header to drop in place. - * @return A list of the alignments. - */ - public SAMRecord align(final SAMRecord read, final SAMFileHeader header); - - /** - * Get a iterator of alignments, batched by mapping quality. - * @param bases List of bases. - * @return Iterator to alignments. - */ - public Iterable getAllAlignments(final byte[] bases); - - /** - * Get a iterator of aligned reads, batched by mapping quality. - * @param read Read to align. - * @param newHeader Optional new header to use when aligning the read. If present, it must be null. - * @return Iterator to alignments. - */ - public Iterable alignAll(final SAMRecord read, final SAMFileHeader newHeader); -} - - diff --git a/java/src/org/broadinstitute/sting/alignment/Alignment.java b/java/src/org/broadinstitute/sting/alignment/Alignment.java deleted file mode 100644 index ebbc8c1b8..000000000 --- a/java/src/org/broadinstitute/sting/alignment/Alignment.java +++ /dev/null @@ -1,221 +0,0 @@ -package org.broadinstitute.sting.alignment; - -import net.sf.samtools.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; - -/** - * Represents an alignment of a read to a site in the reference genome. 
- * - * @author mhanna - * @version 0.1 - */ -public class Alignment { - protected int contigIndex; - protected long alignmentStart; - protected boolean negativeStrand; - protected int mappingQuality; - - protected char[] cigarOperators; - protected int[] cigarLengths; - - protected int editDistance; - protected String mismatchingPositions; - - protected int numMismatches; - protected int numGapOpens; - protected int numGapExtensions; - protected int bestCount; - protected int secondBestCount; - - /** - * Gets the index of the given contig. - * @return the inde - */ - public int getContigIndex() { return contigIndex; } - - /** - * Gets the starting position for the given alignment. - * @return Starting position. - */ - public long getAlignmentStart() { return alignmentStart; } - - /** - * Is the given alignment on the reverse strand? - * @return True if the alignment is on the reverse strand. - */ - public boolean isNegativeStrand() { return negativeStrand; } - - /** - * Gets the score of this alignment. - * @return The score. - */ - public int getMappingQuality() { return mappingQuality; } - - /** - * Gets the edit distance; will eventually end up in the NM SAM tag - * if this alignment makes it that far. - * @return The edit distance. - */ - public int getEditDistance() { return editDistance; } - - /** - * A string representation of which positions mismatch; contents of MD tag. - * @return String representation of mismatching positions. - */ - public String getMismatchingPositions() { return mismatchingPositions; } - - /** - * Gets the number of mismatches in the read. - * @return Number of mismatches. - */ - public int getNumMismatches() { return numMismatches; } - - /** - * Get the number of gap opens. - * @return Number of gap opens. - */ - public int getNumGapOpens() { return numGapOpens; } - - /** - * Get the number of gap extensions. - * @return Number of gap extensions. 
- */ - public int getNumGapExtensions() { return numGapExtensions; } - - /** - * Get the number of best alignments. - * @return Number of top scoring alignments. - */ - public int getBestCount() { return bestCount; } - - /** - * Get the number of second best alignments. - * @return Number of second best scoring alignments. - */ - public int getSecondBestCount() { return secondBestCount; } - - /** - * Gets the cigar for this alignment. - * @return sam-jdk formatted alignment. - */ - public Cigar getCigar() { - Cigar cigar = new Cigar(); - for(int i = 0; i < cigarOperators.length; i++) { - CigarOperator operator = CigarOperator.characterToEnum(cigarOperators[i]); - cigar.add(new CigarElement(cigarLengths[i],operator)); - } - return cigar; - } - - /** - * Temporarily implement getCigarString() for debugging; the TextCigarCodec is unfortunately - * package-protected. - * @return - */ - public String getCigarString() { - Cigar cigar = getCigar(); - if(cigar.isEmpty()) return "*"; - - StringBuilder cigarString = new StringBuilder(); - for(CigarElement element: cigar.getCigarElements()) { - cigarString.append(element.getLength()); - cigarString.append(element.getOperator()); - } - return cigarString.toString(); - } - - /** - * Stub for inheritance. - */ - public Alignment() {} - - /** - * Create a new alignment object. - * @param contigIndex The contig to which this read aligned. - * @param alignmentStart The point within the contig to which this read aligned. - * @param negativeStrand Forward or reverse alignment of the given read. - * @param mappingQuality How good does BWA think this mapping is? - * @param cigarOperators The ordered operators in the cigar string. - * @param cigarLengths The lengths to which each operator applies. - * @param editDistance The edit distance (cumulative) of the read. - * @param mismatchingPositions String representation of which bases in the read mismatch. - * @param numMismatches Number of total mismatches in the read. 
- * @param numGapOpens Number of gap opens in the read. - * @param numGapExtensions Number of gap extensions in the read. - * @param bestCount Number of best alignments in the read. - * @param secondBestCount Number of second best alignments in the read. - */ - public Alignment(int contigIndex, - int alignmentStart, - boolean negativeStrand, - int mappingQuality, - char[] cigarOperators, - int[] cigarLengths, - int editDistance, - String mismatchingPositions, - int numMismatches, - int numGapOpens, - int numGapExtensions, - int bestCount, - int secondBestCount) { - this.contigIndex = contigIndex; - this.alignmentStart = alignmentStart; - this.negativeStrand = negativeStrand; - this.mappingQuality = mappingQuality; - this.cigarOperators = cigarOperators; - this.cigarLengths = cigarLengths; - this.editDistance = editDistance; - this.mismatchingPositions = mismatchingPositions; - this.numMismatches = numMismatches; - this.numGapOpens = numGapOpens; - this.numGapExtensions = numGapExtensions; - this.bestCount = bestCount; - this.secondBestCount = secondBestCount; - } - - /** - * Creates a read directly from an alignment. - * @param alignment The alignment to convert to a read. - * @param unmappedRead Source of the unmapped read. Should have bases, quality scores, and flags. - * @param newSAMHeader The new SAM header to use in creating this read. Can be null, but if so, the sequence - * dictionary in the - * @return A mapped alignment. - */ - public static SAMRecord convertToRead(Alignment alignment, SAMRecord unmappedRead, SAMFileHeader newSAMHeader) { - SAMRecord read; - try { - read = (SAMRecord)unmappedRead.clone(); - } - catch(CloneNotSupportedException ex) { - throw new ReviewedStingException("Unable to create aligned read from template."); - } - - if(newSAMHeader != null) - read.setHeader(newSAMHeader); - - // If we're realigning a previously aligned record, strip out the placement of the alignment. 
- read.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); - read.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); - read.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); - read.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); - - if(alignment != null) { - read.setReadUnmappedFlag(false); - read.setReferenceIndex(alignment.getContigIndex()); - read.setAlignmentStart((int)alignment.getAlignmentStart()); - read.setReadNegativeStrandFlag(alignment.isNegativeStrand()); - read.setMappingQuality(alignment.getMappingQuality()); - read.setCigar(alignment.getCigar()); - if(alignment.isNegativeStrand()) { - read.setReadBases(BaseUtils.simpleReverseComplement(read.getReadBases())); - read.setBaseQualities(Utils.reverse(read.getBaseQualities())); - } - read.setAttribute("NM",alignment.getEditDistance()); - read.setAttribute("MD",alignment.getMismatchingPositions()); - } - - return read; - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java b/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java deleted file mode 100644 index 16e713bf6..000000000 --- a/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.alignment; - -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; -import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; -import org.broadinstitute.sting.alignment.bwa.BWTFiles; -import net.sf.samtools.SAMRecord; - -import java.util.Iterator; - -/** - * Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them - * of their alignment data, realigning them, and making sure one of the best resulting realignments matches the original - * alignment from the input file. - * - * @author mhanna - * @version 0.1 - */ -public class AlignmentValidationWalker extends ReadWalker { - /** - * The supporting BWT index generated using BWT. - */ - @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false) - private String prefix = null; - - /** - * The instance used to generate alignments. - */ - private BWACAligner aligner = null; - - /** - * Create an aligner object. The aligner object will load and hold the BWT until close() is called. 
- */ - @Override - public void initialize() { - if(prefix == null) - prefix = getToolkit().getArguments().referenceFile.getAbsolutePath(); - BWTFiles bwtFiles = new BWTFiles(prefix); - BWAConfiguration configuration = new BWAConfiguration(); - aligner = new BWACAligner(bwtFiles,configuration); - } - - /** - * Aligns a read to the given reference. - * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. - * @param read Read to align. - * @return Number of reads aligned by this map (aka 1). - */ - @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - //logger.info(String.format("examining read %s", read.getReadName())); - - byte[] bases = read.getReadBases(); - if(read.getReadNegativeStrandFlag()) bases = BaseUtils.simpleReverseComplement(bases); - - boolean matches = true; - Iterable alignments = aligner.getAllAlignments(bases); - Iterator alignmentIterator = alignments.iterator(); - - if(!alignmentIterator.hasNext()) { - matches = read.getReadUnmappedFlag(); - } - else { - Alignment[] alignmentsOfBestQuality = alignmentIterator.next(); - for(Alignment alignment: alignmentsOfBestQuality) { - matches = (alignment.getContigIndex() == read.getReferenceIndex()); - matches &= (alignment.getAlignmentStart() == read.getAlignmentStart()); - matches &= (alignment.isNegativeStrand() == read.getReadNegativeStrandFlag()); - matches &= (alignment.getCigar().equals(read.getCigar())); - matches &= (alignment.getMappingQuality() == read.getMappingQuality()); - if(matches) break; - } - } - - if(!matches) { - logger.error("Found mismatch!"); - logger.error(String.format("Read %s:",read.getReadName())); - logger.error(String.format(" Contig index: %d",read.getReferenceIndex())); - logger.error(String.format(" Alignment start: %d", read.getAlignmentStart())); - logger.error(String.format(" Negative strand: %b", read.getReadNegativeStrandFlag())); - logger.error(String.format(" Cigar: %s%n", 
read.getCigarString())); - logger.error(String.format(" Mapping quality: %s%n", read.getMappingQuality())); - for(Alignment[] alignmentsByScore: alignments) { - for(int i = 0; i < alignmentsByScore.length; i++) { - logger.error(String.format("Alignment %d:",i)); - logger.error(String.format(" Contig index: %d",alignmentsByScore[i].getContigIndex())); - logger.error(String.format(" Alignment start: %d", alignmentsByScore[i].getAlignmentStart())); - logger.error(String.format(" Negative strand: %b", alignmentsByScore[i].isNegativeStrand())); - logger.error(String.format(" Cigar: %s", alignmentsByScore[i].getCigarString())); - logger.error(String.format(" Mapping quality: %s%n", alignmentsByScore[i].getMappingQuality())); - } - } - throw new ReviewedStingException(String.format("Read %s mismatches!", read.getReadName())); - } - - return 1; - } - - /** - * Initial value for reduce. In this case, validated reads will be counted. - * @return 0, indicating no reads yet validated. - */ - @Override - public Integer reduceInit() { return 0; } - - /** - * Calculates the number of reads processed. - * @param value Number of reads processed by this map. - * @param sum Number of reads processed before this map. - * @return Number of reads processed up to and including this map. - */ - @Override - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - /** - * Cleanup. - * @param result Number of reads processed. 
- */ - @Override - public void onTraversalDone(Integer result) { - aligner.close(); - super.onTraversalDone(result); - } - -} diff --git a/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java b/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java deleted file mode 100644 index e97d7a56f..000000000 --- a/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.alignment; - -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; -import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; -import org.broadinstitute.sting.alignment.bwa.BWTFiles; -import net.sf.samtools.*; -import net.sf.picard.reference.ReferenceSequenceFileFactory; - -import java.io.File; -import java.io.PrintStream; - -/** - * Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format. - * Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation. - * - * @author mhanna - * @version 0.1 - */ -@WalkerName("Align") -public class AlignmentWalker extends ReadWalker { - @Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " + - "generated by bwa index -d bwtsw. If unspecified, will default " + - "to the reference specified via the -R argument.",required=false) - private File targetReferenceFile = null; - - @Output - private StingSAMFileWriter out = null; - - /** - * The actual aligner. - */ - private BWACAligner aligner = null; - - /** - * New header to use, if desired. - */ - private SAMFileHeader header; - - /** - * Create an aligner object. The aligner object will load and hold the BWT until close() is called. 
- */ - @Override - public void initialize() { - if(targetReferenceFile == null) - targetReferenceFile = getToolkit().getArguments().referenceFile; - BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath()); - BWAConfiguration configuration = new BWAConfiguration(); - aligner = new BWACAligner(bwtFiles,configuration); - - // Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted. - header = getToolkit().getSAMFileHeader().clone(); - SAMSequenceDictionary referenceDictionary = - ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary(); - header.setSequenceDictionary(referenceDictionary); - header.setSortOrder(SAMFileHeader.SortOrder.unsorted); - - out.writeHeader(header); - } - - /** - * Aligns a read to the given reference. - * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. - * @param read Read to align. - * @return Number of alignments found for this read. - */ - @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - SAMRecord alignedRead = aligner.align(read,header); - out.addAlignment(alignedRead); - return 1; - } - - /** - * Initial value for reduce. In this case, alignments will be counted. - * @return 0, indicating no alignments yet found. - */ - @Override - public Integer reduceInit() { return 0; } - - /** - * Calculates the number of alignments found. - * @param value Number of alignments found by this map. - * @param sum Number of alignments found before this map. - * @return Number of alignments found up to and including this map. - */ - @Override - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - /** - * Cleanup. - * @param result Number of reads processed. 
- */ - @Override - public void onTraversalDone(Integer result) { - aligner.close(); - super.onTraversalDone(result); - } - -} diff --git a/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java b/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java deleted file mode 100644 index 1a1e1197d..000000000 --- a/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.alignment; - -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.alignment.bwa.BWTFiles; -import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; -import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import net.sf.samtools.SAMRecord; - -import java.util.*; -import java.io.PrintStream; - -/** - * Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the - * frequency of that number of placements. - * - * @author mhanna - * @version 0.1 - */ -public class CountBestAlignmentsWalker extends ReadWalker { - /** - * The supporting BWT index generated using BWT. - */ - @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false) - private String prefix = null; - - @Output - private PrintStream out = null; - - /** - * The actual aligner. - */ - private Aligner aligner = null; - - private SortedMap alignmentFrequencies = new TreeMap(); - - /** - * Create an aligner object. The aligner object will load and hold the BWT until close() is called. - */ - @Override - public void initialize() { - if(prefix == null) - prefix = getToolkit().getArguments().referenceFile.getAbsolutePath(); - BWTFiles bwtFiles = new BWTFiles(prefix); - BWAConfiguration configuration = new BWAConfiguration(); - aligner = new BWACAligner(bwtFiles,configuration); - } - - /** - * Aligns a read to the given reference. - * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. - * @param read Read to align. - * @return Number of alignments found for this read. 
- */ - @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - Iterator alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator(); - if(alignmentIterator.hasNext()) { - int numAlignments = alignmentIterator.next().length; - if(alignmentFrequencies.containsKey(numAlignments)) - alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1); - else - alignmentFrequencies.put(numAlignments,1); - } - return 1; - } - - /** - * Initial value for reduce. In this case, validated reads will be counted. - * @return 0, indicating no reads yet validated. - */ - @Override - public Integer reduceInit() { return 0; } - - /** - * Calculates the number of reads processed. - * @param value Number of reads processed by this map. - * @param sum Number of reads processed before this map. - * @return Number of reads processed up to and including this map. - */ - @Override - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - /** - * Cleanup. - * @param result Number of reads processed. - */ - @Override - public void onTraversalDone(Integer result) { - aligner.close(); - for(Map.Entry alignmentFrequency: alignmentFrequencies.entrySet()) - out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue()); - super.onTraversalDone(result); - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java deleted file mode 100644 index ddbf784f5..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java +++ /dev/null @@ -1,38 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa; - -import org.broadinstitute.sting.alignment.Aligner; - -/** - * Align reads using BWA. - * - * @author mhanna - * @version 0.1 - */ -public abstract class BWAAligner implements Aligner { - /** - * The supporting files used by BWA. 
- */ - protected BWTFiles bwtFiles; - - /** - * The current configuration for the BWA aligner. - */ - protected BWAConfiguration configuration; - - /** - * Create a new BWAAligner. Purpose of this call is to ensure that all BWA constructors accept the correct - * parameters. - * @param bwtFiles The many files representing BWTs persisted to disk. - * @param configuration Configuration parameters for the alignment. - */ - public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) { - this.bwtFiles = bwtFiles; - this.configuration = configuration; - } - - /** - * Update the configuration passed to the BWA aligner. - * @param configuration New configuration to set. - */ - public abstract void updateConfiguration(BWAConfiguration configuration); -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java deleted file mode 100644 index 73441cb6a..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java +++ /dev/null @@ -1,44 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa; - -/** - * Configuration for the BWA/C aligner. - * - * @author mhanna - * @version 0.1 - */ -public class BWAConfiguration { - /** - * The maximum edit distance used by BWA. - */ - public Float maximumEditDistance = null; - - /** - * How many gap opens are acceptable within this alignment? - */ - public Integer maximumGapOpens = null; - - /** - * How many gap extensions are acceptable within this alignment? - */ - public Integer maximumGapExtensions = null; - - /** - * Do we disallow indels within a certain range from the start / end? - */ - public Integer disallowIndelWithinRange = null; - - /** - * What is the scoring penalty for a mismatch? - */ - public Integer mismatchPenalty = null; - - /** - * What is the scoring penalty for a gap open? - */ - public Integer gapOpenPenalty = null; - - /** - * What is the scoring penalty for a gap extension? 
- */ - public Integer gapExtensionPenalty = null; -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java deleted file mode 100644 index cd7800900..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java +++ /dev/null @@ -1,240 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.alignment.reference.packing.PackUtils; -import org.broadinstitute.sting.alignment.reference.bwt.BWT; -import org.broadinstitute.sting.alignment.reference.bwt.BWTWriter; -import org.broadinstitute.sting.alignment.reference.bwt.SuffixArray; -import org.broadinstitute.sting.alignment.reference.bwt.SuffixArrayWriter; -import org.broadinstitute.sting.alignment.reference.bwt.ANNWriter; -import org.broadinstitute.sting.alignment.reference.bwt.AMBWriter; - -import java.io.File; -import java.io.IOException; - -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import net.sf.samtools.util.StringUtil; - -/** - * Support files for BWT. - * - * @author mhanna - * @version 0.1 - */ -public class BWTFiles { - /** - * ANN (?) file name. - */ - public final File annFile; - - /** - * AMB (?) file name. - */ - public final File ambFile; - - /** - * Packed reference sequence file. - */ - public final File pacFile; - - /** - * Reverse of packed reference sequence file. - */ - public final File rpacFile; - - /** - * Forward BWT file. - */ - public final File forwardBWTFile; - - /** - * Forward suffix array file. - */ - public final File forwardSAFile; - - /** - * Reverse BWT file. - */ - public final File reverseBWTFile; - - /** - * Reverse suffix array file. - */ - public final File reverseSAFile; - - /** - * Where these files autogenerated on the fly? 
- */ - public final boolean autogenerated; - - /** - * Create a new BWA configuration file using the given prefix. - * @param prefix Prefix to use when creating the configuration. Must not be null. - */ - public BWTFiles(String prefix) { - if(prefix == null) - throw new ReviewedStingException("Prefix must not be null."); - annFile = new File(prefix + ".ann"); - ambFile = new File(prefix + ".amb"); - pacFile = new File(prefix + ".pac"); - rpacFile = new File(prefix + ".rpac"); - forwardBWTFile = new File(prefix + ".bwt"); - forwardSAFile = new File(prefix + ".sa"); - reverseBWTFile = new File(prefix + ".rbwt"); - reverseSAFile = new File(prefix + ".rsa"); - autogenerated = false; - } - - /** - * Hand-create a new BWTFiles object, specifying a unique file object for each type. - * @param annFile ANN (alternate dictionary) file. - * @param ambFile AMB (holes) files. - * @param pacFile Packed representation of the forward reference sequence. - * @param forwardBWTFile BWT representation of the forward reference sequence. - * @param forwardSAFile SA representation of the forward reference sequence. - * @param rpacFile Packed representation of the reversed reference sequence. - * @param reverseBWTFile BWT representation of the reversed reference sequence. - * @param reverseSAFile SA representation of the reversed reference sequence. - */ - private BWTFiles(File annFile, - File ambFile, - File pacFile, - File forwardBWTFile, - File forwardSAFile, - File rpacFile, - File reverseBWTFile, - File reverseSAFile) { - this.annFile = annFile; - this.ambFile = ambFile; - this.pacFile = pacFile; - this.forwardBWTFile = forwardBWTFile; - this.forwardSAFile = forwardSAFile; - this.rpacFile = rpacFile; - this.reverseBWTFile = reverseBWTFile; - this.reverseSAFile = reverseSAFile; - autogenerated = true; - } - - /** - * Close out this files object, in the process deleting any temporary filse - * that were created. 
- */ - public void close() { - if(autogenerated) { - boolean success = true; - success = annFile.delete(); - success &= ambFile.delete(); - success &= pacFile.delete(); - success &= forwardBWTFile.delete(); - success &= forwardSAFile.delete(); - success &= rpacFile.delete(); - success &= reverseBWTFile.delete(); - success &= reverseSAFile.delete(); - - if(!success) - throw new ReviewedStingException("Unable to clean up autogenerated representation"); - } - } - - /** - * Create a new set of BWT files from the given reference sequence. - * @param referenceSequence Sequence from which to build metadata. - * @return A new object representing encoded representations of each sequence. - */ - public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) { - byte[] normalizedReferenceSequence = new byte[referenceSequence.length]; - System.arraycopy(referenceSequence,0,normalizedReferenceSequence,0,referenceSequence.length); - normalizeReferenceSequence(normalizedReferenceSequence); - - File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile; - try { - // Write the ann and amb for this reference sequence. - annFile = File.createTempFile("bwt",".ann"); - ambFile = File.createTempFile("bwt",".amb"); - - SAMSequenceDictionary dictionary = new SAMSequenceDictionary(); - dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length)); - - ANNWriter annWriter = new ANNWriter(annFile); - annWriter.write(dictionary); - annWriter.close(); - - AMBWriter ambWriter = new AMBWriter(ambFile); - ambWriter.writeEmpty(dictionary); - ambWriter.close(); - - // Write the encoded files for the forward version of this reference sequence. 
- pacFile = File.createTempFile("bwt",".pac"); - bwtFile = File.createTempFile("bwt",".bwt"); - saFile = File.createTempFile("bwt",".sa"); - - writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile); - - // Write the encoded files for the reverse version of this reference sequence. - byte[] reverseReferenceSequence = Utils.reverse(normalizedReferenceSequence); - - rpacFile = File.createTempFile("bwt",".rpac"); - rbwtFile = File.createTempFile("bwt",".rbwt"); - rsaFile = File.createTempFile("bwt",".rsa"); - - writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to write autogenerated reference sequence to temporary files"); - } - - // Make sure that, at the very least, all temporary files are deleted on exit. - annFile.deleteOnExit(); - ambFile.deleteOnExit(); - pacFile.deleteOnExit(); - bwtFile.deleteOnExit(); - saFile.deleteOnExit(); - rpacFile.deleteOnExit(); - rbwtFile.deleteOnExit(); - rsaFile.deleteOnExit(); - - return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile); - } - - /** - * Write the encoded form of the reference sequence. In the case of BWA, the encoded reference - * sequence is the reference itself in PAC format, the BWT, and the suffix array. - * @param referenceSequence The reference sequence to encode. - * @param pacFile Target for the PAC-encoded reference. - * @param bwtFile Target for the BWT representation of the reference. - * @param suffixArrayFile Target for the suffix array encoding of the reference. - * @throws java.io.IOException In case of issues writing to the file. 
- */ - private static void writeEncodedReferenceSequence(byte[] referenceSequence, - File pacFile, - File bwtFile, - File suffixArrayFile) throws IOException { - PackUtils.writeReferenceSequence(pacFile,referenceSequence); - - BWT bwt = BWT.createFromReferenceSequence(referenceSequence); - BWTWriter bwtWriter = new BWTWriter(bwtFile); - bwtWriter.write(bwt); - bwtWriter.close(); - - SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence); - SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile); - suffixArrayWriter.write(suffixArray); - suffixArrayWriter.close(); - } - - /** - * Convert the given reference sequence into a form suitable for building into - * on-the-fly sequences. - * @param referenceSequence The reference sequence to normalize. - * @throws org.broadinstitute.sting.utils.exceptions.ReviewedStingException if normalized sequence cannot be generated. - */ - private static void normalizeReferenceSequence(byte[] referenceSequence) { - StringUtil.toUpperCase(referenceSequence); - for(byte base: referenceSequence) { - if(base != 'A' && base != 'C' && base != 'G' && base != 'T') - throw new ReviewedStingException(String.format("Base type %c is not supported when building references on-the-fly",(char)base)); - } - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java deleted file mode 100644 index 8631c42d8..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java +++ /dev/null @@ -1,258 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa.c; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.alignment.Alignment; -import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; -import org.broadinstitute.sting.alignment.bwa.BWTFiles; -import 
org.broadinstitute.sting.alignment.bwa.BWAAligner; - -import java.util.*; - -/** - * An aligner using the BWA/C implementation. - * - * @author mhanna - * @version 0.1 - */ -public class BWACAligner extends BWAAligner { - static { - System.loadLibrary("bwa"); - } - - /** - * A pointer to the C++ object representing the BWA engine. - */ - private long thunkPointer = 0; - - public BWACAligner(BWTFiles bwtFiles, BWAConfiguration configuration) { - super(bwtFiles,configuration); - if(thunkPointer != 0) - throw new ReviewedStingException("BWA/C attempting to reinitialize."); - - if(!bwtFiles.annFile.exists()) throw new ReviewedStingException("ANN file is missing; please rerun 'bwa aln' to regenerate it."); - if(!bwtFiles.ambFile.exists()) throw new ReviewedStingException("AMB file is missing; please rerun 'bwa aln' to regenerate it."); - if(!bwtFiles.pacFile.exists()) throw new ReviewedStingException("PAC file is missing; please rerun 'bwa aln' to regenerate it."); - if(!bwtFiles.forwardBWTFile.exists()) throw new ReviewedStingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it."); - if(!bwtFiles.forwardSAFile.exists()) throw new ReviewedStingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it."); - if(!bwtFiles.reverseBWTFile.exists()) throw new ReviewedStingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it."); - if(!bwtFiles.reverseSAFile.exists()) throw new ReviewedStingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it."); - - thunkPointer = create(bwtFiles,configuration); - } - - /** - * Create an aligner object using an array of bytes as a reference. - * @param referenceSequence Reference sequence to encode ad-hoc. - * @param configuration Configuration for the given aligner. 
- */ - public BWACAligner(byte[] referenceSequence, BWAConfiguration configuration) { - this(BWTFiles.createFromReferenceSequence(referenceSequence),configuration); - // Now that the temporary files are created, the temporary files can be destroyed. - bwtFiles.close(); - } - - /** - * Update the configuration passed to the BWA aligner. - * @param configuration New configuration to set. - */ - @Override - public void updateConfiguration(BWAConfiguration configuration) { - if(thunkPointer == 0) - throw new ReviewedStingException("BWA/C: attempting to update configuration of uninitialized aligner."); - updateConfiguration(thunkPointer,configuration); - } - - /** - * Close this instance of the BWA pointer and delete its resources. - */ - @Override - public void close() { - if(thunkPointer == 0) - throw new ReviewedStingException("BWA/C close attempted, but BWA/C is not properly initialized."); - destroy(thunkPointer); - } - - /** - * Allow the aligner to choose one alignment randomly from the pile of best alignments. - * @param bases Bases to align. - * @return An align - */ - @Override - public Alignment getBestAlignment(final byte[] bases) { - if(thunkPointer == 0) - throw new ReviewedStingException("BWA/C getBestAlignment attempted, but BWA/C is not properly initialized."); - return getBestAlignment(thunkPointer,bases); - } - - /** - * Get the best aligned read, chosen randomly from the pile of best alignments. - * @param read Read to align. - * @param newHeader New header to apply to this SAM file. Can be null, but if so, read header must be valid. - * @return Read with injected alignment data. 
- */ - @Override - public SAMRecord align(final SAMRecord read, final SAMFileHeader newHeader) { - if(bwtFiles.autogenerated) - throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable"); - return Alignment.convertToRead(getBestAlignment(read.getReadBases()),read,newHeader); - } - - /** - * Get a iterator of alignments, batched by mapping quality. - * @param bases List of bases. - * @return Iterator to alignments. - */ - @Override - public Iterable getAllAlignments(final byte[] bases) { - final BWAPath[] paths = getPaths(bases); - return new Iterable() { - public Iterator iterator() { - return new Iterator() { - /** - * The last position accessed. - */ - private int position = 0; - - /** - * Whether all alignments have been seen based on the current position. - * @return True if any more alignments are pending. False otherwise. - */ - public boolean hasNext() { return position < paths.length; } - - /** - * Return the next cross-section of alignments, based on mapping quality. - * @return Array of the next set of alignments of a given mapping quality. - */ - public Alignment[] next() { - if(position >= paths.length) - throw new UnsupportedOperationException("Out of alignments to return."); - int score = paths[position].score; - int startingPosition = position; - while(position < paths.length && paths[position].score == score) position++; - return convertPathsToAlignments(bases,Arrays.copyOfRange(paths,startingPosition,position)); - } - - /** - * Unsupported. - */ - public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); } - }; - } - }; - } - - /** - * Get a iterator of aligned reads, batched by mapping quality. - * @param read Read to align. - * @param newHeader Optional new header to use when aligning the read. If present, it must be null. - * @return Iterator to alignments. 
- */ - @Override - public Iterable alignAll(final SAMRecord read, final SAMFileHeader newHeader) { - if(bwtFiles.autogenerated) - throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable"); - final Iterable alignments = getAllAlignments(read.getReadBases()); - return new Iterable() { - public Iterator iterator() { - final Iterator alignmentIterator = alignments.iterator(); - return new Iterator() { - /** - * Whether all alignments have been seen based on the current position. - * @return True if any more alignments are pending. False otherwise. - */ - public boolean hasNext() { return alignmentIterator.hasNext(); } - - /** - * Return the next cross-section of alignments, based on mapping quality. - * @return Array of the next set of alignments of a given mapping quality. - */ - public SAMRecord[] next() { - Alignment[] alignmentsOfQuality = alignmentIterator.next(); - SAMRecord[] reads = new SAMRecord[alignmentsOfQuality.length]; - for(int i = 0; i < alignmentsOfQuality.length; i++) { - reads[i] = Alignment.convertToRead(alignmentsOfQuality[i],read,newHeader); - } - return reads; - } - - /** - * Unsupported. - */ - public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); } - }; - } - }; - } - - /** - * Get the paths associated with the given base string. - * @param bases List of bases. - * @return A set of paths through the BWA. - */ - public BWAPath[] getPaths(byte[] bases) { - if(thunkPointer == 0) - throw new ReviewedStingException("BWA/C getPaths attempted, but BWA/C is not properly initialized."); - return getPaths(thunkPointer,bases); - } - - /** - * Create a pointer to the BWA/C thunk. - * @param files BWT source files. - * @param configuration Configuration of the aligner. - * @return Pointer to the BWA/C thunk. 
- */ - protected native long create(BWTFiles files, BWAConfiguration configuration); - - /** - * Update the configuration passed to the BWA aligner. For internal use only. - * @param thunkPointer pointer to BWA object. - * @param configuration New configuration to set. - */ - protected native void updateConfiguration(long thunkPointer, BWAConfiguration configuration); - - /** - * Destroy the BWA/C thunk. - * @param thunkPointer Pointer to the allocated thunk. - */ - protected native void destroy(long thunkPointer); - - /** - * Do the extra steps involved in converting a local alignment to a global alignment. - * @param bases ASCII representation of byte array. - * @param paths Paths through the current BWT. - * @return A list of alignments. - */ - protected Alignment[] convertPathsToAlignments(byte[] bases, BWAPath[] paths) { - if(thunkPointer == 0) - throw new ReviewedStingException("BWA/C convertPathsToAlignments attempted, but BWA/C is not properly initialized."); - return convertPathsToAlignments(thunkPointer,bases,paths); - } - - /** - * Caller to the path generation functionality within BWA/C. Call this method's getPaths() wrapper (above) instead. - * @param thunkPointer pointer to the C++ object managing BWA/C. - * @param bases ASCII representation of byte array. - * @return A list of paths through the specified BWT. - */ - protected native BWAPath[] getPaths(long thunkPointer, byte[] bases); - - /** - * Do the extra steps involved in converting a local alignment to a global alignment. - * Call this method's convertPathsToAlignments() wrapper (above) instead. - * @param thunkPointer pointer to the C++ object managing BWA/C. - * @param bases ASCII representation of byte array. - * @param paths Paths through the current BWT. - * @return A list of alignments. 
- */ - protected native Alignment[] convertPathsToAlignments(long thunkPointer, byte[] bases, BWAPath[] paths); - - /** - * Gets the best alignment from BWA/C, randomly selected from all best-aligned reads. - * @param thunkPointer Pointer to BWA thunk. - * @param bases bases to align. - * @return The best alignment from BWA/C. - */ - protected native Alignment getBestAlignment(long thunkPointer, byte[] bases); -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java b/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java deleted file mode 100755 index 347d4344f..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.alignment.bwa.c; - -/** - * Models a BWA path. 
- * - * @author mhanna - * @version 0.1 - */ -public class BWAPath { - /** - * Number of mismatches encountered along this path. - */ - public final int numMismatches; - - /** - * Number of gap opens encountered along this path. - */ - public final int numGapOpens; - - /** - * Number of gap extensions along this path. - */ - public final int numGapExtensions; - - /** - * Whether this alignment was found on the positive or negative strand. - */ - public final boolean negativeStrand; - - /** - * Starting coordinate in the BWT. - */ - public final long k; - - /** - * Ending coordinate in the BWT. - */ - public final long l; - - /** - * The score of this path. - */ - public final int score; - - /** - * The number of best alignments seen along this path. - */ - public final int bestCount; - - /** - * The number of second best alignments seen along this path. - */ - public final int secondBestCount; - - /** - * Create a new path with the given attributes. - * @param numMismatches Number of mismatches along path. - * @param numGapOpens Number of gap opens along path. - * @param numGapExtensions Number of gap extensions along path. - * @param k Index to first coordinate within BWT. - * @param l Index to last coordinate within BWT. - * @param score Score of this alignment. Not the mapping quality. 
- */ - public BWAPath(int numMismatches, int numGapOpens, int numGapExtensions, boolean negativeStrand, long k, long l, int score, int bestCount, int secondBestCount) { - this.numMismatches = numMismatches; - this.numGapOpens = numGapOpens; - this.numGapExtensions = numGapExtensions; - this.negativeStrand = negativeStrand; - this.k = k; - this.l = l; - this.score = score; - this.bestCount = bestCount; - this.secondBestCount = secondBestCount; - } - -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java b/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java deleted file mode 100644 index ae6e22221..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java +++ /dev/null @@ -1,165 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa.java; - -import org.broadinstitute.sting.alignment.Aligner; -import org.broadinstitute.sting.alignment.Alignment; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.BaseUtils; - -import java.io.File; -import java.io.FileNotFoundException; - -import net.sf.samtools.*; -import net.sf.picard.reference.IndexedFastaSequenceFile; - -/** - * A test harness to ensure that the perfect aligner works. 
- * - * @author mhanna - * @version 0.1 - */ -public class AlignerTestHarness { - public static void main( String argv[] ) throws FileNotFoundException { - if( argv.length != 6 ) { - System.out.println("PerfectAlignerTestHarness "); - System.exit(1); - } - - File referenceFile = new File(argv[0]); - File bwtFile = new File(argv[1]); - File rbwtFile = new File(argv[2]); - File suffixArrayFile = new File(argv[3]); - File reverseSuffixArrayFile = new File(argv[4]); - File bamFile = new File(argv[5]); - - align(referenceFile,bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile,bamFile); - } - - private static void align(File referenceFile, File bwtFile, File rbwtFile, File suffixArrayFile, File reverseSuffixArrayFile, File bamFile) throws FileNotFoundException { - Aligner aligner = new BWAJavaAligner(bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile); - int count = 0; - - SAMFileReader reader = new SAMFileReader(bamFile); - reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); - - int mismatches = 0; - int failures = 0; - - for(SAMRecord read: reader) { - count++; - if( count > 200000 ) break; - //if( count < 366000 ) continue; - //if( count > 2 ) break; - //if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") ) - // continue; - //if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") ) - // continue; - //if( !read.getReadName().endsWith("SL-XBC:1:60:1342:1340#0") ) - // continue; - - SAMRecord alignmentCleaned = null; - try { - alignmentCleaned = (SAMRecord)read.clone(); - } - catch( CloneNotSupportedException ex ) { - throw new ReviewedStingException("SAMRecord clone not supported", ex); - } - - if( alignmentCleaned.getReadNegativeStrandFlag() ) - alignmentCleaned.setReadBases(BaseUtils.simpleReverseComplement(alignmentCleaned.getReadBases())); - - alignmentCleaned.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); - alignmentCleaned.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); - 
alignmentCleaned.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); - alignmentCleaned.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); - - // Clear everything except flags pertaining to pairing and set 'unmapped' status to true. - alignmentCleaned.setFlags(alignmentCleaned.getFlags() & 0x00A1 | 0x000C); - - Iterable alignments = aligner.getAllAlignments(alignmentCleaned.getReadBases()); - if(!alignments.iterator().hasNext() ) { - //throw new StingException(String.format("Unable to align read %s to reference; count = %d",read.getReadName(),count)); - System.out.printf("Unable to align read %s to reference; count = %d%n",read.getReadName(),count); - failures++; - } - - Alignment foundAlignment = null; - for(Alignment[] alignmentsOfQuality: alignments) { - for(Alignment alignment: alignmentsOfQuality) { - if( read.getReadNegativeStrandFlag() != alignment.isNegativeStrand() ) - continue; - if( read.getAlignmentStart() != alignment.getAlignmentStart() ) - continue; - - foundAlignment = alignment; - } - } - - if( foundAlignment != null ) { - //System.out.printf("%s: Aligned read to reference at position %d with %d mismatches, %d gap opens, and %d gap extensions.%n", read.getReadName(), foundAlignment.getAlignmentStart(), foundAlignment.getMismatches(), foundAlignment.getGapOpens(), foundAlignment.getGapExtensions()); - } - else { - System.out.printf("Error aligning read %s%n", read.getReadName()); - - mismatches++; - - IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); - - System.out.printf("read = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(read.getReadString(),read.getCigar(),CigarOperator.DELETION), - read.getAlignmentStart(), - read.getReadNegativeStrandFlag()); - int numDeletions = numDeletionsInCigar(read.getCigar()); - String expectedRef = new 
String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),read.getAlignmentStart(),read.getAlignmentStart()+read.getReadLength()+numDeletions-1).getBases()); - System.out.printf("expected ref = %s%n", formatBasesBasedOnCigar(expectedRef,read.getCigar(),CigarOperator.INSERTION)); - - for(Alignment[] alignmentsOfQuality: alignments) { - for(Alignment alignment: alignmentsOfQuality) { - System.out.println(); - - Cigar cigar = ((BWAAlignment)alignment).getCigar(); - - System.out.printf("read = %s%n", formatBasesBasedOnCigar(read.getReadString(),cigar,CigarOperator.DELETION)); - - int deletionCount = ((BWAAlignment)alignment).getNumberOfBasesMatchingState(AlignmentState.DELETION); - String alignedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignment.getAlignmentStart(),alignment.getAlignmentStart()+read.getReadLength()+deletionCount-1).getBases()); - System.out.printf("actual ref = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(alignedRef,cigar,CigarOperator.INSERTION), - alignment.getAlignmentStart(), - alignment.isNegativeStrand()); - } - } - - //throw new StingException(String.format("Read %s was placed at incorrect location; count = %d%n",read.getReadName(),count)); - } - - - if( count % 1000 == 0 ) - System.out.printf("%d reads examined.%n",count); - } - - System.out.printf("%d reads examined; %d mismatches; %d failures.%n",count,mismatches,failures); - } - - private static String formatBasesBasedOnCigar( String bases, Cigar cigar, CigarOperator toBlank ) { - StringBuilder formatted = new StringBuilder(); - int readIndex = 0; - for(CigarElement cigarElement: cigar.getCigarElements()) { - if(cigarElement.getOperator() == toBlank) { - int number = cigarElement.getLength(); - while( number-- > 0 ) formatted.append(' '); - } - else { - int number = cigarElement.getLength(); - while( number-- > 0 ) 
formatted.append(bases.charAt(readIndex++)); - } - } - return formatted.toString(); - } - - private static int numDeletionsInCigar( Cigar cigar ) { - int numDeletions = 0; - for(CigarElement cigarElement: cigar.getCigarElements()) { - if(cigarElement.getOperator() == CigarOperator.DELETION) - numDeletions += cigarElement.getLength(); - } - return numDeletions; - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java b/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java deleted file mode 100644 index 879ecb5fb..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java +++ /dev/null @@ -1,151 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa.java; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Deque; -import java.util.ArrayDeque; -import java.util.Iterator; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; - -/** - * Represents a sequence of matches. - * - * @author mhanna - * @version 0.1 - */ -public class AlignmentMatchSequence implements Cloneable { - /** - * Stores the particular match entries in the order they occur. - */ - private Deque entries = new ArrayDeque(); - - /** - * Clone the given match sequence. - * @return A deep copy of the current match sequence. - */ - public AlignmentMatchSequence clone() { - AlignmentMatchSequence copy = null; - try { - copy = (AlignmentMatchSequence)super.clone(); - } - catch( CloneNotSupportedException ex ) { - throw new ReviewedStingException("Unable to clone AlignmentMatchSequence."); - } - - copy.entries = new ArrayDeque(); - for( AlignmentMatchSequenceEntry entry: entries ) - copy.entries.add(entry.clone()); - - return copy; - } - - public Cigar convertToCigar(boolean negativeStrand) { - Cigar cigar = new Cigar(); - Iterator iterator = negativeStrand ? 
entries.descendingIterator() : entries.iterator(); - while( iterator.hasNext() ) { - AlignmentMatchSequenceEntry entry = iterator.next(); - CigarOperator operator; - switch( entry.getAlignmentState() ) { - case MATCH_MISMATCH: operator = CigarOperator.MATCH_OR_MISMATCH; break; - case INSERTION: operator = CigarOperator.INSERTION; break; - case DELETION: operator = CigarOperator.DELETION; break; - default: throw new ReviewedStingException("convertToCigar: cannot process state: " + entry.getAlignmentState()); - } - cigar.add( new CigarElement(entry.count,operator) ); - } - return cigar; - } - - /** - * All a new alignment of the given state. - * @param state State to add to the sequence. - */ - public void addNext( AlignmentState state ) { - AlignmentMatchSequenceEntry last = entries.peekLast(); - // If the last entry is the same as this one, increment it. Otherwise, add a new entry. - if( last != null && last.alignmentState == state ) - last.increment(); - else - entries.add(new AlignmentMatchSequenceEntry(state)); - } - - /** - * Gets the current state of this alignment (what's the state of the last base?) - * @return State of the most recently aligned base. - */ - public AlignmentState getCurrentState() { - if( entries.size() == 0 ) - return AlignmentState.MATCH_MISMATCH; - return entries.peekLast().getAlignmentState(); - } - - /** - * How many bases in the read match the given state. - * @param state State to test. - * @return number of bases which match that state. - */ - public int getNumberOfBasesMatchingState(AlignmentState state) { - int matches = 0; - for( AlignmentMatchSequenceEntry entry: entries ) { - if( entry.getAlignmentState() == state ) - matches += entry.count; - } - return matches; - } - - /** - * Stores an individual match sequence entry. - */ - private class AlignmentMatchSequenceEntry implements Cloneable { - /** - * The state of the alignment throughout a given point in the sequence. 
- */ - private final AlignmentState alignmentState; - - /** - * The number of bases having this particular state. - */ - private int count; - - /** - * Create a new sequence entry with the given state. - * @param alignmentState The state that this sequence should contain. - */ - AlignmentMatchSequenceEntry( AlignmentState alignmentState ) { - this.alignmentState = alignmentState; - this.count = 1; - } - - /** - * Clone the given match sequence entry. - * @return A deep copy of the current match sequence entry. - */ - public AlignmentMatchSequenceEntry clone() { - try { - return (AlignmentMatchSequenceEntry)super.clone(); - } - catch( CloneNotSupportedException ex ) { - throw new ReviewedStingException("Unable to clone AlignmentMatchSequenceEntry."); - } - } - - /** - * Retrieves the current state of the alignment. - * @return The state of the current sequence. - */ - AlignmentState getAlignmentState() { - return alignmentState; - } - - /** - * Increment the count of alignments having this particular state. - */ - void increment() { - count++; - } - } -} - diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java b/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java deleted file mode 100644 index 92c603335..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java +++ /dev/null @@ -1,13 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa.java; - -/** - * The state of a given base in the alignment. 
- * - * @author mhanna - * @version 0.1 - */ -public enum AlignmentState { - MATCH_MISMATCH, - INSERTION, - DELETION -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java b/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java deleted file mode 100644 index c59546bbb..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java +++ /dev/null @@ -1,190 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa.java; - -import org.broadinstitute.sting.alignment.Alignment; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import net.sf.samtools.Cigar; - -/** - * An alignment object to be used incrementally as the BWA aligner - * inspects the read. - * - * @author mhanna - * @version 0.1 - */ -public class BWAAlignment extends Alignment implements Cloneable { - /** - * Track the number of alignments that have been created. - */ - private static long numCreated; - - /** - * Which number alignment is this? - */ - private long creationNumber; - - /** - * The aligner performing the alignments. - */ - protected BWAJavaAligner aligner; - - /** - * The sequence of matches/mismatches/insertions/deletions. - */ - private AlignmentMatchSequence alignmentMatchSequence = new AlignmentMatchSequence(); - - /** - * Working variable. How many bases have been matched at this point. - */ - protected int position; - - /** - * Working variable. How many mismatches have been encountered at this point. - */ - private int mismatches; - - /** - * Number of gap opens in alignment. - */ - private int gapOpens; - - /** - * Number of gap extensions in alignment. - */ - private int gapExtensions; - - /** - * Working variable. The lower bound of the alignment within the BWT. - */ - protected long loBound; - - /** - * Working variable. The upper bound of the alignment within the BWT. 
- */ - protected long hiBound; - - protected void setAlignmentStart(long position) { - this.alignmentStart = position; - } - - protected void setNegativeStrand(boolean negativeStrand) { - this.negativeStrand = negativeStrand; - } - - /** - * Cache the score. - */ - private int score; - - public Cigar getCigar() { - return alignmentMatchSequence.convertToCigar(isNegativeStrand()); - } - - /** - * Gets the current state of this alignment (state of the last base viewed).. - * @return Current state of the alignment. - */ - public AlignmentState getCurrentState() { - return alignmentMatchSequence.getCurrentState(); - } - - /** - * Adds the given state to the current alignment. - * @param state State to add to the given alignment. - */ - public void addState( AlignmentState state ) { - alignmentMatchSequence.addNext(state); - } - - /** - * Gets the BWA score of this alignment. - * @return BWA-style scores. 0 is best. - */ - public int getScore() { - return score; - } - - public int getMismatches() { return mismatches; } - public int getGapOpens() { return gapOpens; } - public int getGapExtensions() { return gapExtensions; } - - public void incrementMismatches() { - this.mismatches++; - updateScore(); - } - - public void incrementGapOpens() { - this.gapOpens++; - updateScore(); - } - - public void incrementGapExtensions() { - this.gapExtensions++; - updateScore(); - } - - /** - * Updates the score based on new information about matches / mismatches. - */ - private void updateScore() { - score = mismatches*aligner.MISMATCH_PENALTY + gapOpens*aligner.GAP_OPEN_PENALTY + gapExtensions*aligner.GAP_EXTENSION_PENALTY; - } - - /** - * Create a new alignment with the given parent aligner. - * @param aligner Aligner being used. - */ - public BWAAlignment( BWAJavaAligner aligner ) { - this.aligner = aligner; - this.creationNumber = numCreated++; - } - - /** - * Clone the alignment. - * @return New instance of the alignment. 
- */ - public BWAAlignment clone() { - BWAAlignment newAlignment = null; - try { - newAlignment = (BWAAlignment)super.clone(); - } - catch( CloneNotSupportedException ex ) { - throw new ReviewedStingException("Unable to clone BWAAlignment."); - } - newAlignment.creationNumber = numCreated++; - newAlignment.alignmentMatchSequence = alignmentMatchSequence.clone(); - - return newAlignment; - } - - /** - * How many bases in the read match the given state. - * @param state State to test. - * @return number of bases which match that state. - */ - public int getNumberOfBasesMatchingState(AlignmentState state) { - return alignmentMatchSequence.getNumberOfBasesMatchingState(state); - } - - /** - * Compare this alignment to another alignment. - * @param rhs Other alignment to which to compare. - * @return < 0 if this < other, == 0 if this == other, > 0 if this > other - */ - public int compareTo(Alignment rhs) { - BWAAlignment other = (BWAAlignment)rhs; - - // If the scores are different, disambiguate using the score. - if(score != other.score) - return score > other.score ? 1 : -1; - - // Otherwise, use the order in which the elements were created. - if(creationNumber != other.creationNumber) - return creationNumber > other.creationNumber ? 
-1 : 1; - - return 0; - } - - public String toString() { - return String.format("position: %d, strand: %b, state: %s, mismatches: %d, gap opens: %d, gap extensions: %d, loBound: %d, hiBound: %d, score: %d, creationNumber: %d", position, negativeStrand, alignmentMatchSequence.getCurrentState(), mismatches, gapOpens, gapExtensions, loBound, hiBound, getScore(), creationNumber); - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java deleted file mode 100644 index 81186c53e..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java +++ /dev/null @@ -1,392 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa.java; - -import org.broadinstitute.sting.alignment.reference.bwt.*; -import org.broadinstitute.sting.alignment.bwa.BWAAligner; -import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; -import org.broadinstitute.sting.alignment.Alignment; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; - -import java.io.File; -import java.util.*; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileHeader; - -/** - * Create imperfect alignments from the read to the genome represented by the given BWT / suffix array. - * - * @author mhanna - * @version 0.1 - */ -public class BWAJavaAligner extends BWAAligner { - /** - * BWT in the forward direction. - */ - private BWT forwardBWT; - - /** - * BWT in the reverse direction. - */ - private BWT reverseBWT; - - /** - * Suffix array in the forward direction. - */ - private SuffixArray forwardSuffixArray; - - /** - * Suffix array in the reverse direction. - */ - private SuffixArray reverseSuffixArray; - - /** - * Maximum edit distance (-n option from original BWA). - */ - private final int MAXIMUM_EDIT_DISTANCE = 4; - - /** - * Maximum number of gap opens (-o option from original BWA). 
- */ - private final int MAXIMUM_GAP_OPENS = 1; - - /** - * Maximum number of gap extensions (-e option from original BWA). - */ - private final int MAXIMUM_GAP_EXTENSIONS = 6; - - /** - * Penalty for straight mismatches (-M option from original BWA). - */ - public final int MISMATCH_PENALTY = 3; - - /** - * Penalty for gap opens (-O option from original BWA). - */ - public final int GAP_OPEN_PENALTY = 11; - - /** - * Penalty for gap extensions (-E option from original BWA). - */ - public final int GAP_EXTENSION_PENALTY = 4; - - /** - * Skip the ends of indels. - */ - public final int INDEL_END_SKIP = 5; - - public BWAJavaAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) { - super(null,null); - forwardBWT = new BWTReader(forwardBWTFile).read(); - reverseBWT = new BWTReader(reverseBWTFile).read(); - forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read(); - reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read(); - } - - /** - * Close this instance of the BWA pointer and delete its resources. - */ - @Override - public void close() { - throw new UnsupportedOperationException("BWA aligner can't currently be closed."); - } - - /** - * Update the current parameters of this aligner. - * @param configuration New configuration to set. - */ - public void updateConfiguration(BWAConfiguration configuration) { - throw new UnsupportedOperationException("Configuration of the BWA aligner can't currently be changed."); - } - - /** - * Allow the aligner to choose one alignment randomly from the pile of best alignments. - * @param bases Bases to align. - * @return An align - */ - public Alignment getBestAlignment(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); } - - /** - * Align the read to the reference. - * @param read Read to align. 
- * @param header Optional header to drop in place. - * @return A list of the alignments. - */ - public SAMRecord align(final SAMRecord read, final SAMFileHeader header) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); } - - /** - * Get a iterator of alignments, batched by mapping quality. - * @param bases List of bases. - * @return Iterator to alignments. - */ - public Iterable getAllAlignments(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); } - - /** - * Get a iterator of aligned reads, batched by mapping quality. - * @param read Read to align. - * @param newHeader Optional new header to use when aligning the read. If present, it must be null. - * @return Iterator to alignments. - */ - public Iterable alignAll(final SAMRecord read, final SAMFileHeader newHeader) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); } - - - public List align( SAMRecord read ) { - List successfulMatches = new ArrayList(); - - Byte[] uncomplementedBases = normalizeBases(read.getReadBases()); - Byte[] complementedBases = normalizeBases(Utils.reverse(BaseUtils.simpleReverseComplement(read.getReadBases()))); - - List forwardLowerBounds = LowerBound.create(uncomplementedBases,forwardBWT); - List reverseLowerBounds = LowerBound.create(complementedBases,reverseBWT); - - // Seed the best score with any score that won't overflow on comparison. - int bestScore = Integer.MAX_VALUE - MISMATCH_PENALTY; - int bestDiff = MAXIMUM_EDIT_DISTANCE+1; - int maxDiff = MAXIMUM_EDIT_DISTANCE; - - PriorityQueue alignments = new PriorityQueue(); - - // Create a fictional initial alignment, with the position just off the end of the read, and the limits - // set as the entire BWT. 
- alignments.add(createSeedAlignment(reverseBWT)); - alignments.add(createSeedAlignment(forwardBWT)); - - while(!alignments.isEmpty()) { - BWAAlignment alignment = alignments.remove(); - - // From bwtgap.c in the original BWT; if the rank is worse than the best score + the mismatch PENALTY, move on. - if( alignment.getScore() > bestScore + MISMATCH_PENALTY ) - break; - - Byte[] bases = alignment.isNegativeStrand() ? complementedBases : uncomplementedBases; - BWT bwt = alignment.isNegativeStrand() ? forwardBWT : reverseBWT; - List lowerBounds = alignment.isNegativeStrand() ? reverseLowerBounds : forwardLowerBounds; - - // if z < D(i) then return {} - int mismatches = maxDiff - alignment.getMismatches() - alignment.getGapOpens() - alignment.getGapExtensions(); - if( alignment.position < lowerBounds.size()-1 && mismatches < lowerBounds.get(alignment.position+1).value ) - continue; - - if(mismatches == 0) { - exactMatch(alignment,bases,bwt); - if(alignment.loBound > alignment.hiBound) - continue; - } - - // Found a valid alignment; store it and move on. 
- if(alignment.position >= read.getReadLength()-1) { - for(long bwtIndex = alignment.loBound; bwtIndex <= alignment.hiBound; bwtIndex++) { - BWAAlignment finalAlignment = alignment.clone(); - - if( finalAlignment.isNegativeStrand() ) - finalAlignment.setAlignmentStart(forwardSuffixArray.get(bwtIndex) + 1); - else { - int sizeAlongReference = read.getReadLength() - - finalAlignment.getNumberOfBasesMatchingState(AlignmentState.INSERTION) + - finalAlignment.getNumberOfBasesMatchingState(AlignmentState.DELETION); - finalAlignment.setAlignmentStart(reverseBWT.length() - reverseSuffixArray.get(bwtIndex) - sizeAlongReference + 1); - } - - successfulMatches.add(finalAlignment); - - bestScore = Math.min(finalAlignment.getScore(),bestScore); - bestDiff = Math.min(finalAlignment.getMismatches()+finalAlignment.getGapOpens()+finalAlignment.getGapExtensions(),bestDiff); - maxDiff = bestDiff + 1; - } - - continue; - } - - //System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : ""); - /* - System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(), - alignment.negativeStrand?1:0, - bases.length-alignment.position-1, - alignment.getCurrentState().toString().charAt(0), - alignment.getMismatches(), - alignment.getGapOpens(), - alignment.getGapExtensions(), - lowerBounds.get(alignment.position+1).value, - lowerBounds.get(alignment.position+1).width, - alignment.loBound, - alignment.hiBound); - */ - - // Temporary -- look ahead to see if the next alignment is bounded. 
- boolean allowDifferences = mismatches > 0; - boolean allowMismatches = mismatches > 0; - - if( allowDifferences && - alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() && - read.getReadLength()-1-(alignment.position+1) >= INDEL_END_SKIP+alignment.getGapOpens()+alignment.getGapExtensions() ) { - if( alignment.getCurrentState() == AlignmentState.MATCH_MISMATCH ) { - if( alignment.getGapOpens() < MAXIMUM_GAP_OPENS ) { - // Add a potential insertion extension. - BWAAlignment insertionAlignment = createInsertionAlignment(alignment); - insertionAlignment.incrementGapOpens(); - alignments.add(insertionAlignment); - - // Add a potential deletion by marking a deletion and augmenting the position. - List deletionAlignments = createDeletionAlignments(bwt,alignment); - for( BWAAlignment deletionAlignment: deletionAlignments ) - deletionAlignment.incrementGapOpens(); - alignments.addAll(deletionAlignments); - } - } - else if( alignment.getCurrentState() == AlignmentState.INSERTION ) { - if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) { - // Add a potential insertion extension. - BWAAlignment insertionAlignment = createInsertionAlignment(alignment); - insertionAlignment.incrementGapExtensions(); - alignments.add(insertionAlignment); - } - } - else if( alignment.getCurrentState() == AlignmentState.DELETION ) { - if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) { - // Add a potential deletion by marking a deletion and augmenting the position. 
- List deletionAlignments = createDeletionAlignments(bwt,alignment); - for( BWAAlignment deletionAlignment: deletionAlignments ) - deletionAlignment.incrementGapExtensions(); - alignments.addAll(deletionAlignments); - } - } - } - - // Mismatches - alignments.addAll(createMatchedAlignments(bwt,alignment,bases,allowDifferences&&allowMismatches)); - } - - return successfulMatches; - } - - /** - * Create an seeding alignment to use as a starting point when traversing. - * @param bwt source BWT. - * @return Seed alignment. - */ - private BWAAlignment createSeedAlignment(BWT bwt) { - BWAAlignment seed = new BWAAlignment(this); - seed.setNegativeStrand(bwt == forwardBWT); - seed.position = -1; - seed.loBound = 0; - seed.hiBound = bwt.length(); - return seed; - } - - /** - * Creates a new alignments representing direct matches / mismatches. - * @param bwt Source BWT with which to work. - * @param alignment Alignment for the previous position. - * @param bases The bases in the read. - * @param allowMismatch Should mismatching bases be allowed? - * @return New alignment representing this position if valid; null otherwise. - */ - private List createMatchedAlignments( BWT bwt, BWAAlignment alignment, Byte[] bases, boolean allowMismatch ) { - List newAlignments = new ArrayList(); - - List baseChoices = new ArrayList(); - Byte thisBase = bases[alignment.position+1]; - - if( allowMismatch ) - baseChoices.addAll(Bases.allOf()); - else - baseChoices.add(thisBase); - - if( thisBase != null ) { - // Keep rotating the current base to the last position until we've hit the current base. 
- for( ;; ) { - baseChoices.add(baseChoices.remove(0)); - if( thisBase.equals(baseChoices.get(baseChoices.size()-1)) ) - break; - - } - } - - for(byte base: baseChoices) { - BWAAlignment newAlignment = alignment.clone(); - - newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; - newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound); - - // If this alignment is valid, skip it. - if( newAlignment.loBound > newAlignment.hiBound ) - continue; - - newAlignment.position++; - newAlignment.addState(AlignmentState.MATCH_MISMATCH); - if( bases[newAlignment.position] == null || base != bases[newAlignment.position] ) - newAlignment.incrementMismatches(); - - newAlignments.add(newAlignment); - } - - return newAlignments; - } - - /** - * Create a new alignment representing an insertion at this point in the read. - * @param alignment Alignment from which to derive the insertion. - * @return New alignment reflecting the insertion. - */ - private BWAAlignment createInsertionAlignment( BWAAlignment alignment ) { - // Add a potential insertion extension. - BWAAlignment newAlignment = alignment.clone(); - newAlignment.position++; - newAlignment.addState(AlignmentState.INSERTION); - return newAlignment; - } - - /** - * Create new alignments representing a deletion at this point in the read. - * @param bwt source BWT for inferring deletion info. - * @param alignment Alignment from which to derive the deletion. - * @return New alignments reflecting all possible deletions. - */ - private List createDeletionAlignments( BWT bwt, BWAAlignment alignment) { - List newAlignments = new ArrayList(); - for(byte base: Bases.instance) { - BWAAlignment newAlignment = alignment.clone(); - - newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; - newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound); - - // If this alignment is valid, skip it. 
- if( newAlignment.loBound > newAlignment.hiBound ) - continue; - - newAlignment.addState(AlignmentState.DELETION); - - newAlignments.add(newAlignment); - } - - return newAlignments; - } - - /** - * Exactly match the given alignment against the given BWT. - * @param alignment Alignment to match. - * @param bases Bases to use. - * @param bwt BWT to use. - */ - private void exactMatch( BWAAlignment alignment, Byte[] bases, BWT bwt ) { - while( ++alignment.position < bases.length ) { - byte base = bases[alignment.position]; - alignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; - alignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound); - if( alignment.loBound > alignment.hiBound ) - return; - } - } - - /** - * Make each base into A/C/G/T or null if unknown. - * @param bases Base string to normalize. - * @return Array of normalized bases. - */ - private Byte[] normalizeBases( byte[] bases ) { - Byte[] normalBases = new Byte[bases.length]; - for(int i = 0; i < bases.length; i++) - normalBases[i] = Bases.fromASCII(bases[i]); - return normalBases; - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java b/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java deleted file mode 100644 index 3784643c0..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java +++ /dev/null @@ -1,88 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa.java; - -import java.util.List; -import java.util.ArrayList; - -import org.broadinstitute.sting.alignment.reference.bwt.BWT; - -/** - * At any point along the given read, what is a good lower bound for the - * total number of differences? - * - * @author mhanna - * @version 0.1 - */ -public class LowerBound { - /** - * Lower bound of the suffix array. - */ - public final long loIndex; - - /** - * Upper bound of the suffix array. 
- */ - public final long hiIndex; - - /** - * Width of the bwt from loIndex -> hiIndex, inclusive. - */ - public final long width; - - /** - * The lower bound at the given point. - */ - public final int value; - - /** - * Create a new lower bound with the given value. - * @param loIndex The lower bound of the BWT. - * @param hiIndex The upper bound of the BWT. - * @param value Value for the lower bound at this site. - */ - private LowerBound(long loIndex, long hiIndex, int value) { - this.loIndex = loIndex; - this.hiIndex = hiIndex; - this.width = hiIndex - loIndex + 1; - this.value = value; - } - - /** - * Create a non-optimal bound according to the algorithm specified in Figure 3 of the BWA paper. - * @param bases Bases of the read to use when creating a new BWT. - * @param bwt BWT to check against. - * @return A list of lower bounds at every point in the reference. - * - */ - public static List create(Byte[] bases, BWT bwt) { - List bounds = new ArrayList(); - - long loIndex = 0, hiIndex = bwt.length(); - int mismatches = 0; - for( int i = bases.length-1; i >= 0; i-- ) { - Byte base = bases[i]; - - // Ignore non-ACGT bases. - if( base != null ) { - loIndex = bwt.counts(base) + bwt.occurrences(base,loIndex-1) + 1; - hiIndex = bwt.counts(base) + bwt.occurrences(base,hiIndex); - } - - if( base == null || loIndex > hiIndex ) { - loIndex = 0; - hiIndex = bwt.length(); - mismatches++; - } - bounds.add(0,new LowerBound(loIndex,hiIndex,mismatches)); - } - - return bounds; - } - - /** - * Create a string representation of this bound. - * @return String version of this bound. 
- */ - public String toString() { - return String.format("LowerBound: w = %d, value = %d",width,value); - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/package-info.java b/java/src/org/broadinstitute/sting/alignment/package-info.java deleted file mode 100644 index 60cf1e425..000000000 --- a/java/src/org/broadinstitute/sting/alignment/package-info.java +++ /dev/null @@ -1,4 +0,0 @@ -/** - * Analyses used to validate the correctness and performance the BWA Java bindings. - */ -package org.broadinstitute.sting.alignment; \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java deleted file mode 100644 index 1d97fec79..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java +++ /dev/null @@ -1,68 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; - -import java.io.PrintStream; -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; - -/** - * Writes .amb files - a file indicating where 'holes' (indeterminant bases) - * exist in the contig. Currently, only empty, placeholder AMBs are supported. - * - * @author mhanna - * @version 0.1 - */ -public class AMBWriter { - /** - * Number of holes is fixed at zero. - */ - private static final int NUM_HOLES = 0; - - /** - * Input stream from which to read BWT data. - */ - private final PrintStream out; - - /** - * Create a new ANNWriter targeting the given file. - * @param file file into which ANN data should be written. - * @throws java.io.IOException if there is a problem opening the output file. - */ - public AMBWriter(File file) throws IOException { - out = new PrintStream(file); - } - - /** - * Create a new ANNWriter targeting the given OutputStream. - * @param stream Stream into which ANN data should be written. 
- */ - public AMBWriter(OutputStream stream) { - out = new PrintStream(stream); - } - - /** - * Write the contents of the given dictionary into the AMB file. - * Assumes that there are no holes in the dictionary. - * @param dictionary Dictionary to write. - */ - public void writeEmpty(SAMSequenceDictionary dictionary) { - long genomeLength = 0L; - for(SAMSequenceRecord sequence: dictionary.getSequences()) - genomeLength += sequence.getSequenceLength(); - - int sequences = dictionary.getSequences().size(); - - // Write the header - out.printf("%d %d %d%n",genomeLength,sequences,NUM_HOLES); - } - - /** - * Close the given output stream. - */ - public void close() { - out.close(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java deleted file mode 100644 index 17296c31c..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java +++ /dev/null @@ -1,95 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; - -import java.io.PrintStream; -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; - -/** - * Writes .ann files - an alternate sequence dictionary format - * used by BWA/C. For best results, the input sequence dictionary - * should be created with Picard's CreateSequenceDictionary.jar, - * TRUNCATE_NAMES_AT_WHITESPACE=false. - * - * @author mhanna - * @version 0.1 - */ -public class ANNWriter { - /** - * BWA uses a fixed seed of 11, written into every file. - */ - private static final int BNS_SEED = 11; - - /** - * A seemingly unused value that appears in every contig in the ANN. - */ - private static final int GI = 0; - - /** - * Input stream from which to read BWT data. 
- */ - private final PrintStream out; - - /** - * Create a new ANNWriter targeting the given file. - * @param file file into which ANN data should be written. - * @throws IOException if there is a problem opening the output file. - */ - public ANNWriter(File file) throws IOException { - out = new PrintStream(file); - } - - /** - * Create a new ANNWriter targeting the given OutputStream. - * @param stream Stream into which ANN data should be written. - */ - public ANNWriter(OutputStream stream) { - out = new PrintStream(stream); - } - - /** - * Write the contents of the given dictionary into the ANN file. - * Assumes that no ambs (blocks of indeterminate base) are present in the dictionary. - * @param dictionary Dictionary to write. - */ - public void write(SAMSequenceDictionary dictionary) { - long genomeLength = 0L; - for(SAMSequenceRecord sequence: dictionary.getSequences()) - genomeLength += sequence.getSequenceLength(); - - int sequences = dictionary.getSequences().size(); - - // Write the header - out.printf("%d %d %d%n",genomeLength,sequences,BNS_SEED); - - for(SAMSequenceRecord sequence: dictionary.getSequences()) { - String fullSequenceName = sequence.getSequenceName(); - String trimmedSequenceName = fullSequenceName; - String sequenceComment = "(null)"; - - long offset = 0; - - // Separate the sequence name from the sequence comment, based on BWA's definition. - // BWA's definition appears to accept a zero-length contig name, so mimic that behavior. - if(fullSequenceName.indexOf(' ') >= 0) { - trimmedSequenceName = fullSequenceName.substring(0,fullSequenceName.indexOf(' ')); - sequenceComment = fullSequenceName.substring(fullSequenceName.indexOf(' ')+1); - } - - // Write the sequence GI (?), name, and comment. - out.printf("%d %s %s%n",GI,trimmedSequenceName,sequenceComment); - // Write the sequence offset, length, and ambs (currently fixed at 0). 
- out.printf("%d %d %d%n",offset,sequence.getSequenceLength(),0); - } - } - - /** - * Close the given output stream. - */ - public void close() { - out.close(); - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java deleted file mode 100644 index 7f8c48253..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java +++ /dev/null @@ -1,172 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import org.broadinstitute.sting.alignment.reference.packing.PackUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * Represents the Burrows-Wheeler Transform of a reference sequence. - * - * @author mhanna - * @version 0.1 - */ -public class BWT { - /** - * Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases. - * For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0 - */ - public static final int SEQUENCE_BLOCK_SIZE = 128; - - /** - * The inverse SA, used as a placeholder for determining where the special EOL character sits. - */ - protected final long inverseSA0; - - /** - * Cumulative counts for the entire BWT. - */ - protected final Counts counts; - - /** - * The individual sequence blocks, modelling how they appear on disk. - */ - protected final SequenceBlock[] sequenceBlocks; - - /** - * Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII). - * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence. - * @param counts Cumulative count of bases, in A,C,G,T order. - * @param sequenceBlocks The full BWT sequence, sans the '$'. - */ - public BWT( long inverseSA0, Counts counts, SequenceBlock[] sequenceBlocks ) { - this.inverseSA0 = inverseSA0; - this.counts = counts; - this.sequenceBlocks = sequenceBlocks; - } - - /** - * Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII). 
- * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence. - * @param counts Count of bases, in A,C,G,T order. - * @param sequence The full BWT sequence, sans the '$'. - */ - public BWT( long inverseSA0, Counts counts, byte[] sequence ) { - this(inverseSA0,counts,generateSequenceBlocks(sequence)); - } - - /** - * Extract the full sequence from the list of block. - * @return The full BWT string as a byte array. - */ - public byte[] getSequence() { - byte[] sequence = new byte[(int)counts.getTotal()]; - for( SequenceBlock block: sequenceBlocks ) - System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength); - return sequence; - } - - /** - * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search. - * @param base The base. - * @return Total counts for all bases lexicographically smaller than this base. - */ - public long counts(byte base) { - return counts.getCumulative(base); - } - - /** - * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search. - * @param base The base. - * @param index The position to search within the BWT. - * @return Total counts for all bases lexicographically smaller than this base. - */ - public long occurrences(byte base,long index) { - SequenceBlock block = getSequenceBlock(index); - int position = getSequencePosition(index); - long accumulator = block.occurrences.get(base); - for(int i = 0; i <= position; i++) { - if(base == block.sequence[i]) - accumulator++; - } - return accumulator; - } - - /** - * The number of bases in the BWT as a whole. - * @return Number of bases. - */ - public long length() { - return counts.getTotal(); - } - - /** - * Create a new BWT from the given reference sequence. - * @param referenceSequence Sequence from which to derive the BWT. - * @return reference sequence-derived BWT. 
- */ - public static BWT createFromReferenceSequence(byte[] referenceSequence) { - SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence); - - byte[] bwt = new byte[(int)suffixArray.length()-1]; - int bwtIndex = 0; - for(long suffixArrayIndex = 0; suffixArrayIndex < suffixArray.length(); suffixArrayIndex++) { - if(suffixArray.get(suffixArrayIndex) == 0) - continue; - bwt[bwtIndex++] = referenceSequence[(int)suffixArray.get(suffixArrayIndex)-1]; - } - - return new BWT(suffixArray.inverseSA0,suffixArray.occurrences,bwt); - } - - /** - * Gets the base at a given position in the BWT. - * @param index The index to use. - * @return The base at that location. - */ - protected byte getBase(long index) { - if(index == inverseSA0) - throw new ReviewedStingException(String.format("Base at index %d does not have a text representation",index)); - - SequenceBlock block = getSequenceBlock(index); - int position = getSequencePosition(index); - return block.sequence[position]; - } - - private SequenceBlock getSequenceBlock(long index) { - // If the index is above the SA-1[0], remap it to the appropriate coordinate space. - if(index > inverseSA0) index--; - return sequenceBlocks[(int)(index/SEQUENCE_BLOCK_SIZE)]; - } - - private int getSequencePosition(long index) { - // If the index is above the SA-1[0], remap it to the appropriate coordinate space. - if(index > inverseSA0) index--; - return (int)(index%SEQUENCE_BLOCK_SIZE); - } - - /** - * Create a set of sequence blocks from one long sequence. - * @param sequence Sequence from which to derive blocks. - * @return Array of sequence blocks containing data from the sequence. 
- */ - private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) { - Counts occurrences = new Counts(); - - int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE); - SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks]; - - for( int block = 0; block < numSequenceBlocks; block++ ) { - int blockStart = block*SEQUENCE_BLOCK_SIZE; - int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart); - byte[] subsequence = new byte[blockLength]; - - System.arraycopy(sequence,blockStart,subsequence,0,blockLength); - - sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence); - - for( byte base: subsequence ) - occurrences.increment(base); - } - - return sequenceBlocks; - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java deleted file mode 100644 index 64a595419..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java +++ /dev/null @@ -1,86 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream; -import org.broadinstitute.sting.alignment.reference.packing.BasePackedInputStream; -import org.broadinstitute.sting.alignment.reference.packing.PackUtils; - -import java.io.*; -import java.nio.ByteOrder; -/** - * Reads a BWT from a given file. - * - * @author mhanna - * @version 0.1 - */ -public class BWTReader { - /** - * Input stream from which to read BWT data. - */ - private FileInputStream inputStream; - - /** - * Create a new BWT reader. - * @param inputFile File in which the BWT is stored. 
- */ - public BWTReader( File inputFile ) { - try { - this.inputStream = new FileInputStream(inputFile); - } - catch( FileNotFoundException ex ) { - throw new ReviewedStingException("Unable to open input file", ex); - } - } - - /** - * Read a BWT from the input stream. - * @return The BWT stored in the input stream. - */ - public BWT read() { - UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN); - BasePackedInputStream basePackedInputStream = new BasePackedInputStream(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN); - - long inverseSA0; - long[] count; - SequenceBlock[] sequenceBlocks; - - try { - inverseSA0 = uintPackedInputStream.read(); - count = new long[PackUtils.ALPHABET_SIZE]; - uintPackedInputStream.read(count); - - long bwtSize = count[PackUtils.ALPHABET_SIZE-1]; - sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)]; - - for( int block = 0; block < sequenceBlocks.length; block++ ) { - int sequenceStart = block* BWT.SEQUENCE_BLOCK_SIZE; - int sequenceLength = (int)Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart); - - long[] occurrences = new long[PackUtils.ALPHABET_SIZE]; - byte[] bwt = new byte[sequenceLength]; - - uintPackedInputStream.read(occurrences); - basePackedInputStream.read(bwt); - - sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,new Counts(occurrences,false),bwt); - } - } - catch( IOException ex ) { - throw new ReviewedStingException("Unable to read BWT from input stream.", ex); - } - - return new BWT(inverseSA0, new Counts(count,true), sequenceBlocks); - } - - /** - * Close the input stream. 
- */ - public void close() { - try { - inputStream.close(); - } - catch( IOException ex ) { - throw new ReviewedStingException("Unable to close input file", ex); - } - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java deleted file mode 100644 index f24baf766..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java +++ /dev/null @@ -1,60 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; - -import java.io.File; -import java.io.IOException; - -/** - * Generate BWA supplementary files (.ann, .amb) from the command line. - * - * @author mhanna - * @version 0.1 - */ -public class BWTSupplementaryFileGenerator { - enum SupplementaryFileType { ANN, AMB } - - public static void main(String[] args) throws IOException { - if(args.length < 3) - usage("Incorrect number of arguments supplied"); - - File fastaFile = new File(args[0]); - File outputFile = new File(args[1]); - SupplementaryFileType outputType = null; - try { - outputType = Enum.valueOf(SupplementaryFileType.class,args[2]); - } - catch(IllegalArgumentException ex) { - usage("Invalid output type: " + args[2]); - } - - ReferenceSequenceFile sequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fastaFile); - SAMSequenceDictionary dictionary = sequenceFile.getSequenceDictionary(); - - switch(outputType) { - case ANN: - ANNWriter annWriter = new ANNWriter(outputFile); - annWriter.write(dictionary); - annWriter.close(); - break; - case AMB: - AMBWriter ambWriter = new AMBWriter(outputFile); - ambWriter.writeEmpty(dictionary); - ambWriter.close(); - break; - default: - usage("Unsupported output type: " + outputType); - } - 
} - - /** - * Print usage information and exit. - */ - private static void usage(String message) { - System.err.println(message); - System.err.println("Usage: BWTSupplementaryFileGenerator "); - System.exit(1); - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java deleted file mode 100644 index b3867ebfe..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java +++ /dev/null @@ -1,71 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream; -import org.broadinstitute.sting.alignment.reference.packing.BasePackedOutputStream; - -import java.io.*; -import java.nio.ByteOrder; - -/** - * Writes an in-memory BWT to an outputstream. - * - * @author mhanna - * @version 0.1 - */ -public class BWTWriter { - /** - * Input stream from which to read BWT data. - */ - private final OutputStream outputStream; - - /** - * Create a new BWT writer. - * @param outputFile File in which the BWT is stored. - */ - public BWTWriter( File outputFile ) { - try { - this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile)); - } - catch( FileNotFoundException ex ) { - throw new ReviewedStingException("Unable to open output file", ex); - } - } - - /** - * Write a BWT to the output stream. - * @param bwt Transform to be written to the output stream. 
- */ - public void write( BWT bwt ) { - UnsignedIntPackedOutputStream intPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN); - BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream(Integer.class, outputStream, ByteOrder.LITTLE_ENDIAN); - - try { - intPackedOutputStream.write(bwt.inverseSA0); - intPackedOutputStream.write(bwt.counts.toArray(true)); - - for( SequenceBlock block: bwt.sequenceBlocks ) { - intPackedOutputStream.write(block.occurrences.toArray(false)); - basePackedOutputStream.write(block.sequence); - } - - // The last block is the last set of counts in the structure. - intPackedOutputStream.write(bwt.counts.toArray(false)); - } - catch( IOException ex ) { - throw new ReviewedStingException("Unable to read BWT from input stream.", ex); - } - } - - /** - * Close the input stream. - */ - public void close() { - try { - outputStream.close(); - } - catch( IOException ex ) { - throw new ReviewedStingException("Unable to close input file", ex); - } - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java deleted file mode 100644 index bc0a5b63d..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java +++ /dev/null @@ -1,108 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Enhanced enum representation of a base. - * - * @author mhanna - * @version 0.1 - */ -public class Bases implements Iterable -{ - public static byte A = 'A'; - public static byte C = 'C'; - public static byte G = 'G'; - public static byte T = 'T'; - - public static final Bases instance = new Bases(); - - private static final List allBases; - - /** - * Representation of the base broken down by packed value. 
- */ - private static final Map basesByPack = new HashMap(); - - static { - List bases = new ArrayList(); - bases.add(A); - bases.add(C); - bases.add(G); - bases.add(T); - allBases = Collections.unmodifiableList(bases); - - for(int i = 0; i < allBases.size(); i++) - basesByPack.put(i,allBases.get(i)); - } - - /** - * Create a new base with the given ascii representation and - * pack value. - */ - private Bases() { - } - - /** - * Return all possible bases. - * @return Byte representation of all bases. - */ - public static Collection allOf() { - return allBases; - } - - /** - * Gets the number of known bases. - * @return The number of known bases. - */ - public static int size() { - return allBases.size(); - } - - /** - * Gets an iterator over the total number of known base types. - * @return Iterator over all known bases. - */ - public Iterator iterator() { - return basesByPack.values().iterator(); - } - - /** - * Get the given base from the packed representation. - * @param pack Packed representation. - * @return base. - */ - public static byte fromPack( int pack ) { return basesByPack.get(pack); } - - /** - * Convert the given base to its packed value. - * @param ascii ASCII representation of the base. - * @return Packed value. - */ - public static int toPack( byte ascii ) - { - for( Map.Entry entry: basesByPack.entrySet() ) { - if( entry.getValue().equals(ascii) ) - return entry.getKey(); - } - throw new ReviewedStingException(String.format("Base %c is an invalid base to pack", (char)ascii)); - } - - /** - * Convert the ASCII representation of a base to its 'normalized' representation. - * @param base The base itself. - * @return The byte, if present. Null if unknown. 
- */ - public static Byte fromASCII( byte base ) { - Byte found = null; - for( Byte normalized: allBases ) { - if( normalized.equals(base) ) { - found = normalized; - break; - } - } - return found; - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java deleted file mode 100644 index 268b11ac4..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java +++ /dev/null @@ -1,151 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.HashMap; -import java.util.Map; - -/** - * Counts of how many bases of each type have been seen. - * - * @author mhanna - * @version 0.1 - */ -public class Counts implements Cloneable { - /** - * Internal representation of counts, broken down by ASCII value. - */ - private Map counts = new HashMap(); - - /** - * Internal representation of cumulative counts, broken down by ASCII value. - */ - private Map cumulativeCounts = new HashMap(); - - /** - * Create an empty Counts object with values A=0,C=0,G=0,T=0. - */ - public Counts() - { - for(byte base: Bases.instance) { - counts.put(base,0L); - cumulativeCounts.put(base,0L); - } - } - - /** - * Create a counts data structure with the given initial values. - * @param data Count data, broken down by base. - * @param cumulative Whether the counts are cumulative, (count_G=numA+numC+numG,for example). 
- */ - public Counts( long[] data, boolean cumulative ) { - if(cumulative) { - long priorCount = 0; - for(byte base: Bases.instance) { - long count = data[Bases.toPack(base)]; - counts.put(base,count-priorCount); - cumulativeCounts.put(base,priorCount); - priorCount = count; - } - } - else { - long priorCount = 0; - for(byte base: Bases.instance) { - long count = data[Bases.toPack(base)]; - counts.put(base,count); - cumulativeCounts.put(base,priorCount); - priorCount += count; - } - } - } - - /** - * Convert to an array for persistence. - * @param cumulative Use a cumulative representation. - * @return Array of count values. - */ - public long[] toArray(boolean cumulative) { - long[] countArray = new long[counts.size()]; - if(cumulative) { - int index = 0; - boolean first = true; - for(byte base: Bases.instance) { - if(first) { - first = false; - continue; - } - countArray[index++] = getCumulative(base); - } - countArray[countArray.length-1] = getTotal(); - } - else { - int index = 0; - for(byte base: Bases.instance) - countArray[index++] = counts.get(base); - } - return countArray; - } - - /** - * Create a unique copy of the current object. - * @return A duplicate of this object. - */ - public Counts clone() { - Counts other; - try { - other = (Counts)super.clone(); - } - catch(CloneNotSupportedException ex) { - throw new ReviewedStingException("Unable to clone counts object", ex); - } - other.counts = new HashMap(counts); - other.cumulativeCounts = new HashMap(cumulativeCounts); - return other; - } - - /** - * Increment the number of bases seen at the given location. - * @param base Base to increment. - */ - public void increment(byte base) { - counts.put(base,counts.get(base)+1); - boolean increment = false; - for(byte cumulative: Bases.instance) { - if(increment) cumulativeCounts.put(cumulative,cumulativeCounts.get(cumulative)+1); - increment |= (cumulative == base); - } - } - - /** - * Gets a count of the number of bases seen at a given location. 
- * Note that counts in this case are not cumulative (counts for A,C,G,T - * are independent). - * @param base Base for which to query counts. - * @return Number of bases of this type seen. - */ - public long get(byte base) { - return counts.get(base); - } - - /** - * Gets a count of the number of bases seen before this base. - * Note that counts in this case are cumulative. - * @param base Base for which to query counts. - * @return Number of bases of this type seen. - */ - public long getCumulative(byte base) { - return cumulativeCounts.get(base); - } - - /** - * How many total bases are represented by this count structure? - * @return Total bases represented. - */ - public long getTotal() { - int accumulator = 0; - for(byte base: Bases.instance) { - accumulator += get(base); - } - return accumulator; - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java deleted file mode 100755 index 92bb713f0..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITHoc THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.alignment.reference.bwt; - -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import net.sf.picard.reference.ReferenceSequence; - -import java.io.*; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.alignment.reference.packing.PackUtils; - -/** - * Create a suffix array data structure. - * - * @author mhanna - * @version 0.1 - */ -public class CreateBWTFromReference { - private byte[] loadReference( File inputFile ) { - // Read in the first sequence in the input file - ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); - ReferenceSequence sequence = reference.nextSequence(); - return sequence.getBases(); - } - - private byte[] loadReverseReference( File inputFile ) { - ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); - ReferenceSequence sequence = reference.nextSequence(); - PackUtils.reverse(sequence.getBases()); - return sequence.getBases(); - } - - private Counts countOccurrences( byte[] sequence ) { - Counts occurrences = new Counts(); - for( byte base: sequence ) - occurrences.increment(base); - return occurrences; - } - - private long[] createSuffixArray( byte[] sequence ) { - return SuffixArray.createFromReferenceSequence(sequence).sequence; - } - - private long[] invertSuffixArray( long[] suffixArray ) { - 
long[] inverseSuffixArray = new long[suffixArray.length]; - for( int i = 0; i < suffixArray.length; i++ ) - inverseSuffixArray[(int)suffixArray[i]] = i; - return inverseSuffixArray; - } - - private long[] createCompressedSuffixArray( int[] suffixArray, int[] inverseSuffixArray ) { - long[] compressedSuffixArray = new long[suffixArray.length]; - compressedSuffixArray[0] = inverseSuffixArray[0]; - for( int i = 1; i < suffixArray.length; i++ ) - compressedSuffixArray[i] = inverseSuffixArray[suffixArray[i]+1]; - return compressedSuffixArray; - } - - private long[] createInversedCompressedSuffixArray( int[] compressedSuffixArray ) { - long[] inverseCompressedSuffixArray = new long[compressedSuffixArray.length]; - for( int i = 0; i < compressedSuffixArray.length; i++ ) - inverseCompressedSuffixArray[compressedSuffixArray[i]] = i; - return inverseCompressedSuffixArray; - } - - public static void main( String argv[] ) throws IOException { - if( argv.length != 5 ) { - System.out.println("USAGE: CreateBWTFromReference .fasta "); - return; - } - - String inputFileName = argv[0]; - File inputFile = new File(inputFileName); - - String bwtFileName = argv[1]; - File bwtFile = new File(bwtFileName); - - String rbwtFileName = argv[2]; - File rbwtFile = new File(rbwtFileName); - - String saFileName = argv[3]; - File saFile = new File(saFileName); - - String rsaFileName = argv[4]; - File rsaFile = new File(rsaFileName); - - CreateBWTFromReference creator = new CreateBWTFromReference(); - - byte[] sequence = creator.loadReference(inputFile); - byte[] reverseSequence = creator.loadReverseReference(inputFile); - - // Count the occurences of each given base. - Counts occurrences = creator.countOccurrences(sequence); - System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Bases.A), - occurrences.getCumulative(Bases.C), - occurrences.getCumulative(Bases.G), - occurrences.getCumulative(Bases.T)); - - // Generate the suffix array and print diagnostics. 
- long[] suffixArrayData = creator.createSuffixArray(sequence); - long[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence); - - // Invert the suffix array and print diagnostics. - long[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData); - long[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData); - - SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData ); - SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData ); - - /* - // Create the data structure for the compressed suffix array and print diagnostics. - int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray); - int reconstructedInverseSA = compressedSuffixArray[0]; - for( int i = 0; i < 8; i++ ) { - System.out.printf("compressedSuffixArray[%d] = %d (SA-1[%d] = %d)%n", i, compressedSuffixArray[i], i, reconstructedInverseSA); - reconstructedInverseSA = compressedSuffixArray[reconstructedInverseSA]; - } - - // Create the data structure for the inverse compressed suffix array and print diagnostics. - int[] inverseCompressedSuffixArray = creator.createInversedCompressedSuffixArray(compressedSuffixArray); - for( int i = 0; i < 8; i++ ) { - System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]); - } - */ - - // Create the BWT. - BWT bwt = BWT.createFromReferenceSequence(sequence); - BWT reverseBWT = BWT.createFromReferenceSequence(reverseSequence); - - byte[] bwtSequence = bwt.getSequence(); - System.out.printf("BWT: %s... 
(length = %d)%n", new String(bwtSequence,0,80),bwt.length()); - - BWTWriter bwtWriter = new BWTWriter(bwtFile); - bwtWriter.write(bwt); - bwtWriter.close(); - - BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile); - reverseBWTWriter.write(reverseBWT); - reverseBWTWriter.close(); - - /* - SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile); - saWriter.write(suffixArray); - saWriter.close(); - - SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile); - reverseSAWriter.write(reverseSuffixArray); - reverseSAWriter.close(); - */ - - File existingBWTFile = new File(inputFileName+".bwt"); - BWTReader existingBWTReader = new BWTReader(existingBWTFile); - BWT existingBWT = existingBWTReader.read(); - - byte[] existingBWTSequence = existingBWT.getSequence(); - System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,80),existingBWT.length()); - - for( int i = 0; i < bwt.length(); i++ ) { - if( bwtSequence[i] != existingBWTSequence[i] ) - throw new ReviewedStingException("BWT mismatch at " + i); - } - - File existingSAFile = new File(inputFileName+".sa"); - SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT); - SuffixArray existingSuffixArray = existingSuffixArrayReader.read(); - - for(int i = 0; i < suffixArray.length(); i++) { - if( i % 10000 == 0 ) - System.out.printf("Validating suffix array entry %d%n", i); - if( suffixArray.get(i) != existingSuffixArray.get(i) ) - throw new ReviewedStingException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i))); - } - } - -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java deleted file mode 100644 index 13714de1e..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java +++ /dev/null @@ -1,41 +0,0 @@ -package 
org.broadinstitute.sting.alignment.reference.bwt; - -/** - * Models a block of bases within the BWT. - */ -public class SequenceBlock { - /** - * Start position of this sequence within the BWT. - */ - public final int sequenceStart; - - /** - * Length of this sequence within the BWT. - */ - public final int sequenceLength; - - - /** - * Occurrences of each letter up to this sequence block. - */ - public final Counts occurrences; - - /** - * Sequence for this segment. - */ - public final byte[] sequence; - - /** - * Create a new block within this BWT. - * @param sequenceStart Starting position of this sequence within the BWT. - * @param sequenceLength Length of this sequence. - * @param occurrences How many of each base has been seen before this sequence began. - * @param sequence The actual sequence from the BWT. - */ - public SequenceBlock( int sequenceStart, int sequenceLength, Counts occurrences, byte[] sequence ) { - this.sequenceStart = sequenceStart; - this.sequenceLength = sequenceLength; - this.occurrences = occurrences; - this.sequence = sequence; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java deleted file mode 100644 index dba3633d1..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java +++ /dev/null @@ -1,159 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Comparator; -import java.util.TreeSet; - -import net.sf.samtools.util.StringUtil; - -/** - * An in-memory representation of a suffix array. - * - * @author mhanna - * @version 0.1 - */ -public class SuffixArray { - public final long inverseSA0; - public final Counts occurrences; - - /** - * The elements of the sequence actually stored in memory. 
- */ - protected final long[] sequence; - - /** - * How often are individual elements in the sequence actually stored - * in memory, as opposed to being calculated on the fly? - */ - protected final int sequenceInterval; - - /** - * The BWT used to calculate missing portions of the sequence. - */ - protected final BWT bwt; - - public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence) { - this(inverseSA0,occurrences,sequence,1,null); - } - - /** - * Creates a new sequence array with the given inverse SA, occurrences, and values. - * @param inverseSA0 Inverse SA entry for the first element. - * @param occurrences Cumulative number of occurrences of A,C,G,T, in order. - * @param sequence The full suffix array. - * @param sequenceInterval How frequently is the sequence interval stored. - * @param bwt bwt used to infer the remaining entries in the BWT. - */ - public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence, int sequenceInterval, BWT bwt) { - this.inverseSA0 = inverseSA0; - this.occurrences = occurrences; - this.sequence = sequence; - this.sequenceInterval = sequenceInterval; - this.bwt = bwt; - - if(sequenceInterval != 1 && bwt == null) - throw new ReviewedStingException("A BWT must be provided if the sequence interval is not 1"); - } - - /** - * Retrieves the length of the sequence array. - * @return Length of the suffix array. - */ - public long length() { - if( bwt != null ) - return bwt.length()+1; - else - return sequence.length; - } - - /** - * Get the suffix array value at a given sequence. - * @param index Index at which to retrieve the suffix array vaule. - * @return The suffix array value at that entry. - */ - public long get(long index) { - int iterations = 0; - while(index%sequenceInterval != 0) { - // The inverseSA0 ('$') doesn't have a usable ASCII representation; it must be treated as a special case. 
- if(index == inverseSA0) - index = 0; - else { - byte base = bwt.getBase(index); - index = bwt.counts(base) + bwt.occurrences(base,index); - } - iterations++; - } - return (sequence[(int)(index/sequenceInterval)]+iterations) % length(); - } - - /** - * Create a suffix array from a given reference sequence. - * @param sequence The reference sequence to use when building the suffix array. - * @return a constructed suffix array. - */ - public static SuffixArray createFromReferenceSequence(byte[] sequence) { - // The builder for the suffix array. Use an integer in this case because - // Java arrays can only hold an integer. - TreeSet suffixArrayBuilder = new TreeSet(new SuffixArrayComparator(sequence)); - - Counts occurrences = new Counts(); - for( byte base: sequence ) - occurrences.increment(base); - - // Build out the suffix array using a custom comparator. - for( int i = 0; i <= sequence.length; i++ ) - suffixArrayBuilder.add(i); - - // Copy the suffix array into an array. - long[] suffixArray = new long[suffixArrayBuilder.size()]; - int i = 0; - for( Integer element: suffixArrayBuilder ) - suffixArray[i++] = element; - - // Find the first element in the inverse suffix array. - long inverseSA0 = -1; - for(i = 0; i < suffixArray.length; i++) { - if(suffixArray[i] == 0) - inverseSA0 = i; - } - if(inverseSA0 < 0) - throw new ReviewedStingException("Unable to find first inverse SA entry in generated suffix array."); - - return new SuffixArray(inverseSA0,occurrences,suffixArray); - } - - /** - * Compares two suffix arrays of the given sequence. Will return whichever string appears - * first in lexicographic order. - */ - private static class SuffixArrayComparator implements Comparator { - /** - * The data source for all suffix arrays. - */ - private final String sequence; - - /** - * Create a new comparator. - * @param sequence Reference sequence to use as basis for comparison. 
- */ - public SuffixArrayComparator( byte[] sequence ) { - // Processing the suffix array tends to be easier as a string. - this.sequence = StringUtil.bytesToString(sequence); - } - - /** - * Compare the two given suffix arrays. Criteria for comparison is the lexicographic order of - * the two substrings sequence[lhs:], sequence[rhs:]. - * @param lhs Left-hand side of comparison. - * @param rhs Right-hand side of comparison. - * @return How the suffix arrays represented by lhs, rhs compare. - */ - public int compare( Integer lhs, Integer rhs ) { - String lhsSuffixArray = sequence.substring(lhs); - String rhsSuffixArray = sequence.substring(rhs); - return lhsSuffixArray.compareTo(rhsSuffixArray); - } - } - -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java deleted file mode 100644 index c10984145..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java +++ /dev/null @@ -1,82 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream; -import org.broadinstitute.sting.alignment.reference.packing.PackUtils; - -import java.io.*; -import java.nio.ByteOrder; - -/** - * A reader for suffix arrays in permanent storage. - * - * @author mhanna - * @version 0.1 - */ -public class SuffixArrayReader { - /** - * Input stream from which to read suffix array data. - */ - private FileInputStream inputStream; - - /** - * BWT to use to fill in missing data. - */ - private BWT bwt; - - /** - * Create a new suffix array reader. - * @param inputFile File in which the suffix array is stored. - * @param bwt BWT to use when filling in missing data. 
- */ - public SuffixArrayReader(File inputFile, BWT bwt) { - try { - this.inputStream = new FileInputStream(inputFile); - this.bwt = bwt; - } - catch( FileNotFoundException ex ) { - throw new ReviewedStingException("Unable to open input file", ex); - } - } - - /** - * Read a suffix array from the input stream. - * @return The suffix array stored in the input stream. - */ - public SuffixArray read() { - UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN); - - long inverseSA0; - long[] occurrences; - long[] suffixArray; - int suffixArrayInterval; - - try { - inverseSA0 = uintPackedInputStream.read(); - occurrences = new long[PackUtils.ALPHABET_SIZE]; - uintPackedInputStream.read(occurrences); - // Throw away the suffix array size in bytes and use the occurrences table directly. - suffixArrayInterval = (int)uintPackedInputStream.read(); - suffixArray = new long[(int)((occurrences[occurrences.length-1]+suffixArrayInterval-1)/suffixArrayInterval)]; - uintPackedInputStream.read(suffixArray); - } - catch( IOException ex ) { - throw new ReviewedStingException("Unable to read BWT from input stream.", ex); - } - - return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray, suffixArrayInterval, bwt); - } - - - /** - * Close the input stream. 
- */ - public void close() { - try { - inputStream.close(); - } - catch( IOException ex ) { - throw new ReviewedStingException("Unable to close input file", ex); - } - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java deleted file mode 100644 index 972fc2a15..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java +++ /dev/null @@ -1,67 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.bwt; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream; - -import java.io.*; -import java.nio.ByteOrder; - -/** - * Javadoc goes here. - * - * @author mhanna - * @version 0.1 - */ -public class SuffixArrayWriter { - /** - * Input stream from which to read suffix array data. - */ - private OutputStream outputStream; - - /** - * Create a new suffix array reader. - * @param outputFile File in which the suffix array is stored. - */ - public SuffixArrayWriter( File outputFile ) { - try { - this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile)); - } - catch( FileNotFoundException ex ) { - throw new ReviewedStingException("Unable to open input file", ex); - } - } - - /** - * Write a suffix array to the output stream. - * @param suffixArray suffix array to write. - */ - public void write(SuffixArray suffixArray) { - UnsignedIntPackedOutputStream uintPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN); - - try { - uintPackedOutputStream.write(suffixArray.inverseSA0); - uintPackedOutputStream.write(suffixArray.occurrences.toArray(true)); - // How frequently the suffix array entry is placed. - uintPackedOutputStream.write(1); - // Length of the suffix array. 
- uintPackedOutputStream.write(suffixArray.length()-1); - uintPackedOutputStream.write(suffixArray.sequence,1,suffixArray.sequence.length-1); - } - catch( IOException ex ) { - throw new ReviewedStingException("Unable to read BWT from input stream.", ex); - } - } - - - /** - * Close the input stream. - */ - public void close() { - try { - outputStream.close(); - } - catch( IOException ex ) { - throw new ReviewedStingException("Unable to close input file", ex); - } - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java b/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java deleted file mode 100644 index 6681e37ec..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java +++ /dev/null @@ -1,92 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.packing; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.*; -import java.nio.ByteOrder; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; - -/** - * Reads a packed version of the input stream. - * - * @author mhanna - * @version 0.1 - */ -public class BasePackedInputStream { - /** - * Type of object to unpack. - */ - private final Class type; - - /** - * Ultimate source for packed bases. - */ - private final FileInputStream targetInputStream; - - /** - * Channel source for packed bases. - */ - private final FileChannel targetInputChannel; - - /** - * A fixed-size buffer for word-packed data. - */ - private final ByteOrder byteOrder; - - /** - * How many bases are in a given packed word. - */ - private final int basesPerPackedWord = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE; - - /** - * How many bytes in an integer? 
- */ - private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE; - - - public BasePackedInputStream( Class type, File inputFile, ByteOrder byteOrder ) throws FileNotFoundException { - this(type,new FileInputStream(inputFile),byteOrder); - } - - public BasePackedInputStream( Class type, FileInputStream inputStream, ByteOrder byteOrder ) { - if( type != Integer.class ) - throw new ReviewedStingException("Only bases packed into 32-bit words are currently supported by this input stream. Type specified: " + type.getName()); - this.type = type; - this.targetInputStream = inputStream; - this.targetInputChannel = inputStream.getChannel(); - this.byteOrder = byteOrder; - } - - /** - * Read the entire contents of the input stream. - * @param bwt array into which bases should be read. - * @throws IOException if an I/O error occurs. - */ - public void read(byte[] bwt) throws IOException { - read(bwt,0,bwt.length); - } - - /** - * Read the next length bases into the bwt array, starting at the given offset. - * @param bwt array holding the given data. - * @param offset target position in the bases array into which bytes should be written. - * @param length number of bases to read from the stream. - * @throws IOException if an I/O error occurs. 
- */ - public void read(byte[] bwt, int offset, int length) throws IOException { - int bufferWidth = ((bwt.length+basesPerPackedWord-1)/basesPerPackedWord)*bytesPerInteger; - ByteBuffer buffer = ByteBuffer.allocate(bufferWidth).order(byteOrder); - targetInputChannel.read(buffer); - targetInputChannel.position(targetInputChannel.position()+buffer.remaining()); - buffer.flip(); - - int packedWord = 0; - int i = 0; - while(i < length) { - if(i % basesPerPackedWord == 0) packedWord = buffer.getInt(); - int position = basesPerPackedWord - i%basesPerPackedWord - 1; - bwt[offset+i++] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3)); - } - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java b/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java deleted file mode 100644 index c62f40e51..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java +++ /dev/null @@ -1,140 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.packing; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; - -/** - * A general-purpose stream for writing packed bases. - * - * @author mhanna - * @version 0.1 - */ -public class BasePackedOutputStream { - /** - * Type of object to pack. - */ - private final Class type; - - /** - * How many bases can be stored in the given data structure? - */ - private final int basesPerType; - - /** - * Ultimate target for the packed bases. - */ - private final OutputStream targetOutputStream; - - /** - * A fixed-size buffer for word-packed data. 
- */ - private final ByteBuffer buffer; - - public BasePackedOutputStream( Class type, File outputFile, ByteOrder byteOrder ) throws FileNotFoundException { - this(type,new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder); - } - - /** - * Write packed bases to the given output stream. - * @param type Type of data to pack bases into. - * @param outputStream Output stream to which to write packed bases. - * @param byteOrder Switch between big endian / little endian when reading / writing files. - */ - public BasePackedOutputStream( Class type, OutputStream outputStream, ByteOrder byteOrder) { - this.targetOutputStream = outputStream; - this.type = type; - basesPerType = PackUtils.bitsInType(type)/PackUtils.BITS_PER_BASE; - this.buffer = ByteBuffer.allocate(basesPerType/PackUtils.ALPHABET_SIZE).order(byteOrder); - } - - /** - * Writes the given base to the output stream. Will write only this base; no packing will be performed. - * @param base List of bases to write. - * @throws IOException if an I/O error occurs. - */ - public void write( int base ) throws IOException { - write( new byte[] { (byte)base } ); - } - - /** - * Writes an array of bases to the target output stream. - * @param bases List of bases to write. - * @throws IOException if an I/O error occurs. - */ - public void write( byte[] bases ) throws IOException { - write(bases,0,bases.length); - } - - /** - * Writes a subset of the array of bases to the output stream. - * @param bases List of bases to write. - * @param offset site at which to start writing. - * @param length number of bases to write. - * @throws IOException if an I/O error occurs. - */ - public void write( byte[] bases, int offset, int length ) throws IOException { - int packedBases = 0; - int positionInPack = 0; - - for( int base = offset; base < offset+length; base++ ) { - packedBases = packBase(bases[base], packedBases, positionInPack); - - // Increment the packed counter. 
If all possible bases have been squeezed into this byte, write it out. - positionInPack = ++positionInPack % basesPerType; - if( positionInPack == 0 ) { - writePackedBases(packedBases); - packedBases = 0; - } - } - - if( positionInPack > 0 ) - writePackedBases(packedBases); - } - - /** - * Flush the contents of the OutputStream to disk. - * @throws IOException if an I/O error occurs. - */ - public void flush() throws IOException { - targetOutputStream.flush(); - } - - /** - * Closes the given output stream. - * @throws IOException if an I/O error occurs. - */ - public void close() throws IOException { - targetOutputStream.close(); - } - - /** - * Pack the given base into the basepack. - * @param base The base to pack. - * @param basePack Target for the pack operation. - * @param position Position within the pack to which to add the base. - * @return The packed integer. - */ - private int packBase( byte base, int basePack, int position ) { - basePack |= (PackUtils.packBase(base) << 2*(basesPerType-position-1)); - return basePack; - } - - /** - * Write the given packed base structure to the output file. - * @param packedBases Packed bases to write. - * @throws IOException on error writing to the file. 
- */ - private void writePackedBases(int packedBases) throws IOException { - buffer.rewind(); - if( type == Integer.class ) - buffer.putInt(packedBases); - else if( type == Byte.class ) - buffer.put((byte)packedBases); - else - throw new ReviewedStingException("Cannot pack bases into type " + type.getName()); - targetOutputStream.write(buffer.array()); - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java b/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java deleted file mode 100755 index 8211c97d8..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.alignment.reference.packing; - -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import net.sf.picard.reference.ReferenceSequence; - -import java.io.*; -import java.nio.ByteOrder; - -/** - * Generate a .PAC file from a given reference. - * - * @author hanna - * @version 0.1 - */ - -public class CreatePACFromReference { - public static void main( String argv[] ) throws IOException { - if( argv.length != 3 ) { - System.out.println("USAGE: CreatePACFromReference .fasta "); - return; - } - - // Read in the first sequence in the input file - String inputFileName = argv[0]; - File inputFile = new File(inputFileName); - ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); - ReferenceSequence sequence = reference.nextSequence(); - - // Target file for output - PackUtils.writeReferenceSequence( new File(argv[1]), sequence.getBases() ); - - // Reverse the bases in the reference - PackUtils.reverse(sequence.getBases()); - - // Target file for output - PackUtils.writeReferenceSequence( new File(argv[2]), sequence.getBases() ); - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java b/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java deleted file mode 100644 index beed21b49..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java +++ /dev/null @@ -1,135 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.packing; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; -import java.io.FileOutputStream; -import java.nio.ByteOrder; - -/** - * Utilities designed for packing / unpacking bases. - * - * @author mhanna - * @version 0.1 - */ -public class PackUtils { - /** - * How many possible bases can be encoded? 
- */ - public static final int ALPHABET_SIZE = 4; - - /** - * How many bits does it take to store a single base? - */ - public static final int BITS_PER_BASE = (int)(Math.log(ALPHABET_SIZE)/Math.log(2)); - - /** - * How many bits fit into a single byte? - */ - public static final int BITS_PER_BYTE = 8; - - /** - * Writes a reference sequence to a PAC file. - * @param outputFile Filename for the PAC file. - * @param referenceSequence Reference sequence to write. - * @throws IOException If there's a problem writing to the output file. - */ - public static void writeReferenceSequence( File outputFile, byte[] referenceSequence ) throws IOException { - OutputStream outputStream = new FileOutputStream(outputFile); - - BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream(Byte.class, outputStream, ByteOrder.BIG_ENDIAN); - basePackedOutputStream.write(referenceSequence); - - outputStream.write(referenceSequence.length%PackUtils.ALPHABET_SIZE); - - outputStream.close(); - } - - - /** - * How many bits can a given type hold? - * @param type Type to test. - * @return Number of bits that the given type can hold. - */ - public static int bitsInType( Class type ) { - try { - long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1; - long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1; - if( typeSize > intTypeSize ) - throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName()); - return (int)(Math.log(typeSize)/Math.log(2)); - } - catch( NoSuchFieldException ex ) { - throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex); - } - catch( IllegalAccessException ex ) { - throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex); - } - } - - /** - * Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b. 
- * @param base ASCII value for the base to pack. - * @return A byte from 0-3 indicating the base's packed value. - */ - public static byte packBase(byte base) { - switch( base ) { - case 'A': - return 0; - case 'C': - return 1; - case 'G': - return 2; - case 'T': - return 3; - default: - throw new ReviewedStingException("Unknown base type: " + base); - } - } - - /** - * Converts a two-bit representation of a base into an ASCII representation of a base. - * @param pack Byte from 0-3 indicating which base is represented. - * @return An ASCII value representing the packed base. - */ - public static byte unpackBase(byte pack) { - switch( pack ) { - case 0: - return 'A'; - case 1: - return 'C'; - case 2: - return 'G'; - case 3: - return 'T'; - default: - throw new ReviewedStingException("Unknown pack type: " + pack); - } - } - - /** - * Reverses an unpacked sequence of bases. - * @param bases bases to reverse. - */ - public static void reverse( byte[] bases ) { - for( int i = 0, j = bases.length-1; i < j; i++, j-- ) { - byte temp = bases[j]; - bases[j] = bases[i]; - bases[i] = temp; - } - } - - /** - * Given a structure of size size that should be split - * into partitionSize partitions, how many partitions should - * be created? Size of last partition will be <= partitionSize. - * @param size Total size of the data structure. - * @param partitionSize Size of an individual partition. - * @return Number of partitions that would be created. 
- */ - public static int numberOfPartitions( long size, long partitionSize ) { - return (int)((size+partitionSize-1) / partitionSize); - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java b/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java deleted file mode 100644 index c07766ee1..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java +++ /dev/null @@ -1,102 +0,0 @@ -package org.broadinstitute.sting.alignment.reference.packing; - -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; - -/** - * Read a set of integers packed into - * - * @author mhanna - * @version 0.1 - */ -public class UnsignedIntPackedInputStream { - /** - * Ultimate target for the occurrence array. - */ - private final FileInputStream targetInputStream; - - /** - * Target channel from which to pull file data. - */ - private final FileChannel targetInputChannel; - - /** - * The byte order in which integer input data appears. - */ - private final ByteOrder byteOrder; - - /** - * How many bytes are required to store an integer? - */ - private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE; - - /** - * Create a new PackedIntInputStream, writing to the given target file. - * @param inputFile target input file. - * @param byteOrder Endianness to use when writing a list of integers. - * @throws java.io.IOException if an I/O error occurs. - */ - public UnsignedIntPackedInputStream(File inputFile, ByteOrder byteOrder) throws IOException { - this(new FileInputStream(inputFile),byteOrder); - } - - /** - * Read ints from the given InputStream. - * @param inputStream Input stream from which to read ints. - * @param byteOrder Endianness to use when writing a list of integers. 
- */ - public UnsignedIntPackedInputStream(FileInputStream inputStream, ByteOrder byteOrder) { - this.targetInputStream = inputStream; - this.targetInputChannel = inputStream.getChannel(); - this.byteOrder = byteOrder; - } - - /** - * Read a datum from the input stream. - * @return The next input datum in the stream. - * @throws IOException if an I/O error occurs. - */ - public long read() throws IOException { - long[] data = new long[1]; - read(data); - return data[0]; - } - - /** - * Read the data from the input stream. - * @param data placeholder for input data. - * @throws IOException if an I/O error occurs. - */ - public void read( long[] data ) throws IOException { - read( data, 0, data.length ); - } - - /** - * Read the data from the input stream, starting at the given offset. - * @param data placeholder for input data. - * @param offset place in the array to start reading in data. - * @param length number of ints to read in. - * @throws IOException if an I/O error occurs. - */ - public void read( long[] data, int offset, int length ) throws IOException { - ByteBuffer readBuffer = ByteBuffer.allocate(bytesPerInteger*length).order(byteOrder); - - targetInputChannel.read(readBuffer,targetInputChannel.position()); - readBuffer.flip(); - targetInputChannel.position(targetInputChannel.position()+readBuffer.remaining()); - - int i = 0; - while(i < length) - data[offset+i++] = readBuffer.getInt() & 0xFFFFFFFFL; - } - - /** - * Closes the given output stream. - * @throws IOException if an I/O error occurs. 
- */ - public void close() throws IOException { - targetInputStream.close(); - } -} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java b/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java deleted file mode 100755 index 9d7853695..000000000 --- a/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.alignment.reference.packing; - -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; - -/** - * Writes an list of integers to the output file. - * - * @author mhanna - * @version 0.1 - */ -public class UnsignedIntPackedOutputStream { - /** - * Ultimate target for the occurrence array. 
- */ - private final OutputStream targetOutputStream; - - /** - * A fixed-size buffer for int-packed data. - */ - private final ByteBuffer buffer; - - /** - * Create a new PackedIntOutputStream, writing to the given target file. - * @param outputFile target output file. - * @param byteOrder Endianness to use when writing a list of integers. - * @throws IOException if an I/O error occurs. - */ - public UnsignedIntPackedOutputStream(File outputFile, ByteOrder byteOrder) throws IOException { - this(new FileOutputStream(outputFile),byteOrder); - } - - /** - * Write packed ints to the given OutputStream. - * @param outputStream Output stream to which to write packed ints. - * @param byteOrder Endianness to use when writing a list of integers. - */ - public UnsignedIntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) { - this.targetOutputStream = outputStream; - buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder); - } - - /** - * Write the data to the output stream. - * @param datum datum to write. - * @throws IOException if an I/O error occurs. - */ - public void write( long datum ) throws IOException { - buffer.rewind(); - buffer.putInt((int)datum); - targetOutputStream.write(buffer.array()); - } - - /** - * Write the data to the output stream. - * @param data data to write. occurrences.length must match alphabet size. - * @throws IOException if an I/O error occurs. - */ - public void write( long[] data ) throws IOException { - for(long datum: data) - write(datum); - } - - /** - * Write the given chunk of data to the input stream. - * @param data data to write. - * @param offset position at which to start. - * @param length number of ints to write. - * @throws IOException if an I/O error occurs. 
- */ - public void write( long[] data, int offset, int length ) throws IOException { - for( int i = offset; i < offset+length; i++ ) - write(data[i]); - } - - /** - * Flush the contents of the OutputStream to disk. - * @throws IOException if an I/O error occurs. - */ - public void flush() throws IOException { - targetOutputStream.flush(); - } - - /** - * Closes the given output stream. - * @throws IOException if an I/O error occurs. - */ - public void close() throws IOException { - targetOutputStream.close(); - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/filters/ContaminatedSampleFilter.java b/java/src/org/broadinstitute/sting/oneoffprojects/filters/ContaminatedSampleFilter.java deleted file mode 100644 index bde03208a..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/filters/ContaminatedSampleFilter.java +++ /dev/null @@ -1,24 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.filters; - -import net.sf.picard.filter.SamRecordFilter; -import net.sf.samtools.SAMRecord; - -import java.util.Arrays; -import java.util.HashSet; - -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl - * - * @Author chartl - * @Date Feb 8, 2010 - */ -public class ContaminatedSampleFilter implements SamRecordFilter { - - private final String[] filteredNames = {"NA19562","NA19006","NA19554","NA18985","NA18988","NA19062","NA19559"}; - - private HashSet contaminatedSamples = new HashSet( Arrays.asList(filteredNames) ); - - public boolean filterOut(SAMRecord read) { - return contaminatedSamples.contains(read.getReadGroup().getSample()); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/tools/CompareBAMAlignments.java b/java/src/org/broadinstitute/sting/oneoffprojects/tools/CompareBAMAlignments.java deleted file mode 100755 index 464967312..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/tools/CompareBAMAlignments.java +++ /dev/null @@ -1,98 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.tools; - 
-import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.CommandLineProgram; - -import java.io.*; -import java.util.ArrayList; -import java.util.Date; -import java.util.Iterator; -import java.util.List; - -/** - * Used to test how long it takes to read through a text files and gzipped files. - * If the passed-in filename ends with .gz, it will be read using GZIPInputStream. - * Otherwise, its read using FileReader. - */ -public class CompareBAMAlignments extends CommandLineProgram { - - @Argument(fullName = "input", shortName = "i", doc = "xxx", required = true) - public List filenames; - - @Argument(fullName = "maxIsize", shortName = "s", doc = "xxx", required=false) - public int maxISize = -1; - - @Argument(fullName = "incr", shortName = "incr", doc = "xxx", required=false) - int incr = -1; - - @Override - protected int execute() { - try { - List> readers = new ArrayList>(); - for ( String filename : filenames ) { - final File file = new File(filename); - SAMFileReader reader = new SAMFileReader(file); - reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); - readers.add(reader.iterator()); - } - - System.out.println("Reading..."); - int next = incr; - int counter = 0; - - while( true ) { - List reads = new ArrayList(); - for ( Iterator reader : readers ) { - if ( ! reader.hasNext() ) System.exit(0); - reads.add(reader.next()); - } - - // comparing - SAMRecord read1 = reads.get(0); - if ( read1.getInferredInsertSize() > maxISize ) { - for ( SAMRecord read : reads ) { - if(incr > 0 && counter % incr == 0) { - next += incr; - System.err.println(new Date() + " - counter " + counter); - System.err.println("read: " + read.format()); - } - - if ( ! 
read1.getReadName().equals(read.getReadName()) ) - bad(read1, read, "Names not equal"); - else { - if ( read1.getAlignmentStart() != read.getAlignmentStart() ) - bad(read1, read, "Alignment starts not equal"); - if ( ! read1.getCigarString().equals(read.getCigarString()) ) - bad(read1, read, "Unequal CIGAR strings"); - } - } - } - counter++; - } - } catch(Exception e) { - System.err.println("ERROR: " + e); - e.printStackTrace(); - } - - return 0; - } - - private void bad(SAMRecord read1, SAMRecord read2, String msg) { - System.out.printf("%nBAD: %s%n", msg); - System.out.printf(" read1: %s %s %s %s%n", read1.getReadName(), read1.getAlignmentStart(), read1.getCigarString(), read1.getInferredInsertSize()); - System.out.printf(" read2: %s %s %s %s%n", read2.getReadName(), read2.getAlignmentStart(), read2.getCigarString(), read2.getInferredInsertSize()); - // System.exit(1); - } - - public static void main(String args[]) - { - try { - CommandLineProgram.start(new CompareBAMAlignments(), args); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/tools/JavaIOSpeedTest.java b/java/src/org/broadinstitute/sting/oneoffprojects/tools/JavaIOSpeedTest.java deleted file mode 100755 index 47ee352d4..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/tools/JavaIOSpeedTest.java +++ /dev/null @@ -1,76 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.tools; - -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileReader; -import java.io.InputStreamReader; -import java.io.Reader; -import java.util.Date; -import java.util.zip.GZIPInputStream; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.CommandLineProgram; - -/** - * Used to test how long it takes to read through a text files and gzipped files. 
- * If the passed-in filename ends with .gz, it will be read using GZIPInputStream. - * Otherwise, its read using FileReader. - */ -public class JavaIOSpeedTest extends CommandLineProgram { - - @Argument(fullName = "input", shortName = "i", doc = "file to read", required = true) public String filename; - @Argument(fullName = "buffer_size", shortName = "s", doc = "read buffer size in mb", required=false) public int bufferSize = -1; - - @Override - protected int execute() { - System.out.println("Filename: " + filename); - try { - final File file = new File(filename); - final Reader reader; - if(filename.endsWith(".gz")) { - reader = new InputStreamReader(new GZIPInputStream(new BufferedInputStream(new FileInputStream(file)))); - } else { - reader = new FileReader(file); - } - - final BufferedReader br; - if(bufferSize != -1) { - br = new BufferedReader(reader, bufferSize * 1000000); - } else { - br = new BufferedReader(reader); - } - - System.out.println("Reading..."); - int incr = 10000000; - int next = incr; - int counter = 0; - while(br.ready()) { - br.readLine(); - if(++counter == next) { - next += incr; - System.err.println(new Date() + " - file: " + filename + ", buffer size: " + bufferSize + "mb, read " + counter + " lines..."); - } - } - - System.out.println("Read " + counter + " lines from " + filename); - - } catch(Exception e) { - System.err.println("ERROR: " + e); - e.printStackTrace(); - } - - return 0; - } - - public static void main(String args[]) - { - try { - CommandLineProgram.start(new JavaIOSpeedTest(), args); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentInfo.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentInfo.java deleted file mode 100644 index c56ee5ea2..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentInfo.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute 
- * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.utils; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Aug 3, 2010 - * Time: 4:10:42 PM - * To change this template use File | Settings | File Templates. - */ - -/** A simple utility class that encapsulates information about a single alignment (offset, strand, overlap, mismatch count). - * - */ -public class AlignmentInfo { - private int offset; - private int overlap; - private int mm ; - private Assembly a = null; - - private static int RIDICULOUSLY_LARGE_NUMBER = 1000000000; - - public AlignmentInfo() { - offset = 0; - mm = RIDICULOUSLY_LARGE_NUMBER; - overlap = -1; - a = null; - } - - public AlignmentInfo(int mm, int offset, boolean isRc, int overlap, Assembly a) { - this.offset = (isRc ? 
(-offset-1) : offset ); - this.overlap = overlap; - this.mm = mm; - this.a = a; - } - - boolean isAligned() { return mm < RIDICULOUSLY_LARGE_NUMBER; } - - public boolean isNegativeStrand() { return offset < 0; } - public Assembly getAssembly() { return a; } - public int getOffset() { return ( offset < 0 ? (-offset-1) : offset ); } - public int getMismatchCount() { return mm; } - public int getOverlap() { return overlap; } - public double getMismatchRate() { return isAligned() ? ((double)mm)/overlap : 1.0; } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentList.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentList.java deleted file mode 100644 index 85dfa8984..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentList.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.utils; - -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.util.List; -import java.util.ArrayList; -import java.util.Iterator; - -public class AlignmentList implements Iterable { - private int best_mm = 1000000000; - private int next_best_mm = 1000000000; - private List als = null; - private int next_best_count = 0; - private int best_overlap = 0; - - private AlignmentStrategy strategy = null; - - public AlignmentList(AlignmentStrategy s) { - this.strategy = s; - best_mm = 1000000000; - next_best_mm = 1000000000; - best_overlap = 0; - als = new ArrayList(1); - } - - public boolean isAligned() { - return best_mm < 1000000000; - } - - public List getAlignments() { return als; } - - public int size() { return als.size(); } - - public Iterator iterator() { return als.iterator(); } - - // public void tryAdd(int mm, int offset, boolean isRc, int overlap) { - // tryAdd(new AlignmentInfo(mm,offset,isRc,overlap)); - // } - - public void tryAdd(AlignmentInfo ai) { - AlignmentStrategy.Action a = strategy.action(ai,this) ; - switch ( a ) { - case DISCARD: break; - case REPLACE_BEST: - next_best_mm = best_mm; - next_best_count = size(); - als.clear(); - als.add(ai); - best_mm = ai.getMismatchCount(); - best_overlap = ai.getOverlap(); - break; - case ADD_BEST: - als.add(ai); - if ( ai.getMismatchCount() < best_mm ) best_mm = ai.getMismatchCount(); - if ( ai.getOverlap() > best_overlap) best_overlap = ai.getOverlap(); - break; - case REPLACE_NEXTBEST: - next_best_mm = ai.getMismatchCount(); - next_best_count = 1; - break; - case ADD_NEXTBEST: - next_best_count++; - if ( ai.getMismatchCount() < next_best_mm ) next_best_mm = ai.getMismatchCount(); - break; - default: throw new StingException("Unrecognized action requested: "+a); - } - } - - public void tryAddAll(AlignmentList al) { - for( AlignmentInfo ai : al) { - tryAdd(ai); - } - } - - public int getBestMMCount() { return best_mm; } - public 
int getBestOverlap() { return best_overlap; } - public int getBestHitCount() { return als.size() ; } - public int getNextBestHitCount() { return next_best_count; } - public int getNextBestMMCount() { return next_best_mm; } -// public int getOverlap() { return overlap; } -// public int getOffset() { return offset; } -// public boolean isNegativeStrand() { return rc; } - -// public double getMismatchRate() { return isAligned() ? ((double)best_mm)/overlap : 1.0 ; } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentStrategy.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentStrategy.java deleted file mode 100644 index 559a9ff16..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentStrategy.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.utils; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Aug 3, 2010 - * Time: 2:53:43 PM - * To change this template use File | Settings | File Templates. - */ -public interface AlignmentStrategy { - enum Action { - DISCARD, - REPLACE_BEST, - ADD_BEST, - REPLACE_NEXTBEST, - ADD_NEXTBEST - }; - - public Action action(AlignmentInfo alignment, AlignmentList currentList); -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/Assembly.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/Assembly.java deleted file mode 100644 index 265cf498e..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/Assembly.java +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.utils; - -import org.broadinstitute.sting.utils.collections.PrimitivePair; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.Utils; - -import java.util.List; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; - -import net.sf.samtools.util.StringUtil; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Aug 3, 2010 - * Time: 2:20:22 PM - * To change this template use File | Settings | File Templates. - */ -public class Assembly { - private byte[] consensus; - private short[] coverage; - private short[] mismatches; - private short [][] base_counts; - - private boolean debug = false; - private List seq_ids; - private List seqs; - private List seq_offsets; - - private KmerIndex lookup; // assembled consensus sequence is indexed here - - private int hookedAt = -1; // if set, specifies start on the ref of the assembled consensus sequence - - private static List EMPTY_KMER_LIST = new ArrayList(0); - - private int K = 15; - - private AlignmentStrategy strategy = null; - - /** Creates new assembly seeded with the specified sequence; default key length (15) is used. - * - * @param seq - * @param id - */ - public Assembly(final byte[] seq, String id) { - this(15,seq,id); - } - - /** Creates new assembly seeded with the specified sequence and sets kmer (key) length K for the internally maintained - * lookup index tables. 
- * @param K - * @param seq - * @param id - */ - public Assembly(int K, final byte[] seq, String id) { - this.K = K; - seq_ids = new ArrayList(); - seq_offsets = new ArrayList(); - seqs = new ArrayList(); - seq_ids.add(id); - seq_offsets.add(0); - seqs.add(seq); - consensus = Arrays.copyOf(seq,seq.length); - coverage = new short[seq.length]; - Arrays.fill(coverage,(short)1); - mismatches = new short[seq.length]; // filled with 0's - base_counts = new short[4][seq.length]; - for ( int i = 0 ; i < seq.length ; i++ ) { - int j = BaseUtils.simpleBaseToBaseIndex(seq[i]); - if ( j != -1) base_counts[j][i] = 1; - } - lookup = new KmerIndex(K,seq); - strategy = new DefaultAlignmentStrategy(); - } - - /** Creates new assembly seeded with the specified sequence; default key length (15) is used and the position on the - * reference of the entire assembly is set (as assemblly grows, position on the ref will be updated properly). - * - * @param seq - * @param id - */ - public Assembly(final byte[] seq, String id, int posOnRef) { - this(seq,id); - hookedAt = posOnRef; - } - - /** Creates new assembly seeded the specified sequence and sets kmer (key) length K for the internally maintained - * lookup index tables. Parameter posOnRef specifies the (initial) position of the entire assembly on the - * ref; as the assembly grows, the position on ref will be updated properly. - * @param K - * @param seq - * @param id - */ - public Assembly(int K, final byte[] seq, String id, int posOnRef) { - this(K,seq,id); - hookedAt = posOnRef; - } - - /** Returns total number of sequences currently held by this assembly. - * - * @return - */ - public int getNumSeqs() { return seqs.size() ; } - - /** Attempts to align seq to this assembly's consensus. Does NOT add - * the sequence to the consensus even if it aligns! This methods returns a list of alternative - * best alignments found (according to the strategy used) in a newly allocated AlignmentList object. 
- * @param seq sequence to align to this consensus - * @param tryRC if true, will try aligning both seq and its reverse complement; otherwise - * only forward alignment will be attempted (i.e. best placement of the seq, as it is provided, - * along the assembled consensus sequence) - * @return a newly allocated alignment list; returned list can be empty if no alignments are found - */ - public AlignmentList align(final byte[] seq, boolean tryRC) { - return align(seq,tryRC,null); - } - - /** Attempts to align seq to this assembly's consensus. Does NOT add - * the sequence to the consensus even if it aligns! This method uses existing list of alignments - * (which can contain alignments to a different assembly) and updates it as necessary if a better alignment - * (or multiple better alignments) than the one(s) already held in the list is found. Reference to the - * same alignment list object is returned: this method modifies it's argument. If alignment list argument - * is null, new alignment list object will be allocated and returned by this method. - * - * @param seq sequence to align to this consensus - * @param tryRC if true, will try aligning both seq and its reverse complement; otherwise - * only forward alignment will be attempted (i.e. best placement of the seq, as it is provided, - * along the assembled consensus sequence) - * @return a newly allocated alignment list; returned list can be empty if no alignments are found - */ - public AlignmentList align(final byte[] seq, boolean tryRC, AlignmentList a) { - if ( debug ) System.out.println("Assembly:: aligning sequence of length "+seq.length+"; tryRC="+tryRC+"; K="+K); - - List fw_kmers = KmerIndex.toKeyOffsetList(K,seq); - - if ( debug ) { - for( PrimitivePair.Int kmer: fw_kmers) { - System.out.println("id="+kmer.getFirst()+" seq="+new String(KmerIndex.idToSeq(K,kmer.getFirst()))+" offset on seq="+kmer.getSecond()); - } - } - - byte [] rc_seq = (tryRC ? 
BaseUtils.simpleReverseComplement(seq) : null ); - List rc_kmers = (tryRC ? KmerIndex.toKeyOffsetList(K,rc_seq) : EMPTY_KMER_LIST ); - - if ( a == null ) a = new AlignmentList(strategy); - - // i is the position on the sequence seq or on its reverse complement - for(PrimitivePair.Int kmer : fw_kmers ) { - - List offsets = lookup.getOffsets(kmer.first); - if ( offsets != null ) { - // kmer present in consensus sequence - for ( int s : offsets ) { // s=offset of the current kmer on the assembled consensus - int trial_offset = s - kmer.second; // offset of the seq on the assembled consensus suggested by current kmer/offset - int trial_mm = countMismatches(seq,trial_offset,a.getNextBestMMCount()); - a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,false,overlap(trial_offset,seq.length),this)); - } - } - } - - for ( PrimitivePair.Int kmer : rc_kmers ) { - - List offsets = lookup.getOffsets(kmer.first); - if ( offsets != null ) { - // kmer present in consensus sequence - for ( int s : offsets ) { - int trial_offset = s - kmer.second; - int trial_mm = countMismatches(rc_seq,trial_offset,a.getNextBestMMCount()); - a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,true,overlap(trial_offset,seq.length),this)); - } - } - } - return a; - } - - public void setDebug(boolean d) { this.debug = d; lookup.setDebug(d);} - - public int numSequences() { return seq_ids.size(); } - - private int overlap(int offset, int seq_length ) { - return Math.min(consensus.length,offset+seq_length)-Math.max(0,offset); - } - - private int countMismatches(final byte seq[], int offset, int cutoff) { - int mm = 0; - - int i ; - if ( offset >= 0 ) i = 0; - else { i = (-offset); offset = 0; } - for ( ; i < seq.length && offset < consensus.length ; i++ , offset++ ) { - if ( seq[i] != consensus[offset] ) { - mm++; - if ( mm > cutoff ) break; - } - } - - return mm; - } - - public byte[] getConsensus() { return consensus; } - - public int getPosOnRef() { return hookedAt; } - - public int getConsensusLength() { 
return consensus.length; } - - public List getOffsets() { return seq_offsets; } - public int getOffset(int i) { return seq_offsets.get(i); } - - public List getSeqIds() { return Collections.unmodifiableList(seq_ids); } - - /** Adds specified sequence to this assembly according to the provided - * alignment information. Will properly update consensus sequence of this assembly - * and all associated information (mismatches, base counts etc) - * @param seq - * @param id - * @param a - */ - public void add(final byte[] seq, String id, AlignmentInfo a) { - - if ( ! a.isAligned() ) throw new StingException("Can not add sequence to the assembly: provided alignment is empty"); - - seq_ids.add(id); - - int offset = a.getOffset(); - int oldConsensusLength = consensus.length; - - byte [] seq_to_add = ( a.isNegativeStrand() ? BaseUtils.simpleReverseComplement(seq) : seq); - - seqs.add(seq_to_add); - - int pos_on_seq = 0; - int pos_on_cons = 0; - - int leftExtension = 0; // how many bases we added to the consensus on the left - int rightExtension = 0; // how many bases we added to the consensus on the right - - if ( offset < 0 ) { - // if sequence sticks out to the left of the current consensus: - - leftExtension = -offset; - for(int i = 0 ; i < seq_offsets.size() ; i++ ) { - // we are going to extend consensus to the left, so we need to update all current offsets: - seq_offsets.set(i,seq_offsets.get(i)+leftExtension); - } - - if ( hookedAt > 0 ) hookedAt -= leftExtension; - // extend consensus and associated arrays to the left : - - consensus = Utils.extend(consensus,offset,(byte)0); // remember, offset is negative here, extending to the left - coverage = Utils.extend(coverage,offset,(short)1) ; - mismatches = Utils.extend(mismatches,offset,(short)0); - for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],offset,(short)0); - - for ( int j = 0 ; j < -offset ; j++ ) { - consensus[j] = seq_to_add[j]; - int b = 
BaseUtils.simpleBaseToBaseIndex(seq_to_add[j]); - if ( b != -1 ) base_counts[b][j]=1; - } - - pos_on_seq = pos_on_cons = -offset; - - offset = 0; - } - if ( offset > 0 ) pos_on_cons = offset; - - seq_offsets.add(offset); - - boolean consensus_changed = false; - - for ( ; pos_on_seq < seq_to_add.length && pos_on_cons < consensus.length ; pos_on_seq++, pos_on_cons++ ) { - coverage[pos_on_cons]++; - final byte base = seq_to_add[pos_on_seq]; - final int b = BaseUtils.simpleBaseToBaseIndex(base); - if ( b != -1 ) { - // if base on seq is not a regular base, there is nothing to do; - // otherwise count mismatches and optionally update consensus if current base tips the balance - base_counts[b][pos_on_cons]++; - int maxcount = 0; - int maxb = -1; - for ( int j = 0 ; j < 4 ; j++ ) { - if ( base_counts[j][pos_on_cons] > maxcount ) { - maxcount = base_counts[j][pos_on_cons]; - maxb = j; - } - } - // we are guaranteed here that maxb != -1 since we just added one regular base (the current one) - // few lines above... 
- byte newbase = BaseUtils.baseIndexToSimpleBase(maxb); - if ( newbase != consensus[pos_on_cons] ) { // need to change the consensus base (will recompute mismatches) - consensus[pos_on_cons] = newbase; - consensus_changed = true; - mismatches[pos_on_cons] = 0; - for ( int i = 0 ; i < 4 ; i++ ) { - if ( i == maxb ) continue; - mismatches[pos_on_cons] += base_counts[i][pos_on_cons]; - } - } else { // consensus base did not change; just increment mismatches if current sequence's base differs from consensus - if ( base != consensus[pos_on_cons]) mismatches[pos_on_cons]++; - } - } - - } - - // Last step: if sequence sticks out of current consensus on the right, we need to extend the latter: - - if ( pos_on_seq < seq_to_add.length ) { - // sequence sticks out of consensus to the right - rightExtension = seq_to_add.length - pos_on_seq; - consensus = Utils.extend(consensus,rightExtension,(byte)0); - coverage = Utils.extend(coverage,rightExtension,(short)1); - mismatches = Utils.extend(mismatches,rightExtension,(short)0); - for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],rightExtension,(short)0); - for ( ; pos_on_seq < seq_to_add.length ; pos_on_seq++, pos_on_cons++ ) { - byte base = seq_to_add[pos_on_seq]; - consensus[pos_on_cons] = base; - int b = BaseUtils.simpleBaseToBaseIndex(base); - if ( b != -1 ) base_counts[b][pos_on_cons] = base; - } - } - - // finally, the new sequence we just added could have mismatches that tip some consensus bases into new values; - // let's catch those cases: - - - for ( int i = 0 ; i < consensus.length ; i++ ) { - byte cons_base = consensus[i]; - int b = BaseUtils.simpleBaseToBaseIndex(cons_base); - } - - // there is probably a better way, but for now we just recompute the whole lookup table when consensus - // changes somewhere in the middle (if we want to be samrt we need to identify just the kmers that changed - // and find/change them in the lookup table). 
- if ( consensus_changed ) { - lookup.clear(); - lookup.index(consensus); - } else { - if ( leftExtension > 0 || rightExtension > 0 ) lookup.updateIndex(consensus,leftExtension,oldConsensusLength); - } - - - } - - public String toAlignmentString(boolean mismatchesOnly, boolean printNames) { - - int maxNameLength = 0; - int spacing=3; - - if ( printNames ) { - for ( String n : seq_ids ) if ( n.length() > maxNameLength ) maxNameLength++; - } - - StringBuilder b = new StringBuilder(); - if ( printNames ) b.append(Utils.dupString(' ',maxNameLength+spacing)); - b.append(new String(consensus)); - b.append('\n'); - - for ( int j = 0; j < seqs.size() ; j++ ) { - int offset = seq_offsets.get(j); - byte [] seq = seqs.get(j); - - if ( printNames ) { - b.append(seq_ids.get(j)); - b.append(Utils.dupString(' ',maxNameLength-seq_ids.get(j).length()+spacing)); - } - - for ( int i = 0 ; i < offset ; i++ ) b.append(' '); - - for ( int i = 0 ; i < seq.length ; i++ ) { - - byte base = seq[i]; - if ( mismatchesOnly && base == consensus[i+offset] ) { - b.append('.'); - } else b.append((char)base); - } - b.append('\n'); - } - return b.toString(); - } - - - public static void testMe(String [] argv ) { - byte [] seq1 = "ACGTTGCGTGGTTCACTGCAGTAACTGACTGATGCA".getBytes(); - byte [] seq2 = "GCGTGGTTTACTGCAGTAACTGACTGATGCAACGTGTTTG".getBytes(); - byte [] seq3 = "GGNTGACGTTGCGTGGTTTACTGCAGTAACTGACT".getBytes(); - byte [] seq4 = "NNNTTNCGTGGTTTACTGCAGTAACTGACTGATGCA".getBytes(); - - Assembly a = new Assembly(seq1,"1"); - - AlignmentList al = a.align(seq2,false); - if ( al.isAligned() ) System.out.println("seq 2 aligned"); - else System.out.println("seq 2 did NOT align"); - - if ( al.size() == 1 ) a.add(seq2,"2",al.getAlignments().get(0)); - else System.out.println("Multiple alignments found for seq 2"); - - al = a.align(seq3,false); - if ( al.isAligned() ) System.out.println("seq 3 aligned"); - else System.out.println("seq 3 did NOT align"); - - if ( al.size() == 1 ) 
a.add(seq3,"3",al.getAlignments().get(0)); - else System.out.println("Multiple alignments found for seq 3"); - - al = a.align(seq4,false); - if ( al.isAligned() ) System.out.println("seq 4 aligned"); - else System.out.println("seq 4 did NOT align"); - - if ( al.size() == 1 ) a.add(seq4,"4",al.getAlignments().get(0)); - else System.out.println("Multiple alignments found for seq 4"); - - System.out.println(a.toAlignmentString(true, true)); - - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AssemblyGraph.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AssemblyGraph.java deleted file mode 100644 index 302b35c5f..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AssemblyGraph.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.utils; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.util.List; -import java.util.LinkedList; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 13, 2010 - * Time: 5:53:33 PM - * To change this template use File | Settings | File Templates. - */ -public class AssemblyGraph { - - private List sources; - - public AssemblyGraph(Assembly a) { - sources = new LinkedList(); - sources.add(a); - } - - /** Initializes assembly from the single specified read, and sets this assembly as the root of this - * assembly graph - * @param r read; must be aligned, otherwise exception will be thrown - * @param K index (Kmer) size of the assembly that will be initialized with the read r - */ - public AssemblyGraph(SAMRecord r, int K) { - if (AlignmentUtils.isReadUnmapped(r)) - throw new StingException("Can not initialize assembly graph with unaligned read"); - sources = new LinkedList(); - sources.add( new Assembly(K,r.getReadBases(),r.getReadName(), r.getAlignmentStart()) ); - } - - public void add(SAMRecord r) { - - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/DefaultAlignmentStrategy.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/DefaultAlignmentStrategy.java deleted file mode 100644 index 35ceef4a4..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/DefaultAlignmentStrategy.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to 
permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.utils; - -import org.broadinstitute.sting.utils.exceptions.StingException; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Aug 3, 2010 - * Time: 4:53:01 PM - * To change this template use File | Settings | File Templates. - */ -public class DefaultAlignmentStrategy implements AlignmentStrategy { - - public Action action(AlignmentInfo alignment, AlignmentList currentList) { - if ( currentList.size() == 0 ) return Action.REPLACE_BEST; - - if ( alignment.getMismatchCount() > currentList.getNextBestMMCount() ) return Action.DISCARD; - - if ( alignment.getMismatchCount() < currentList.getBestMMCount() ) return Action.REPLACE_BEST; - if ( alignment.getMismatchCount() == currentList.getBestMMCount() ) return Action.ADD_BEST; - if ( alignment.getMismatchCount() < currentList.getNextBestMMCount() ) return Action.REPLACE_NEXTBEST; - if ( alignment.getMismatchCount() == currentList.getNextBestMMCount() ) return Action.ADD_NEXTBEST; - - throw new StingException("Unexpected case found and left unprocessed"); -// return null; //To change body of implemented methods use File | Settings | File Templates. 
- } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/KmerIndex.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/KmerIndex.java deleted file mode 100644 index 179c09655..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/KmerIndex.java +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.utils; - -import org.broadinstitute.sting.utils.collections.PrimitivePair; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.BaseUtils; - -import java.util.List; -import java.util.HashMap; -import java.util.ArrayList; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. -* User: asivache -* Date: Aug 3, 2010 -* Time: 1:31:15 PM -* To change this template use File | Settings | File Templates. 
-*/ -public class KmerIndex { - private HashMap > lookup; - private int K = -1; - private int mask = 0; - private boolean debug = false; - - /** - * Translates sequence seq into the list of all valid kmers paired - * with their offsets on that sequence. Valid kmer is a kmer that contains only ACGT bases; if seq - * contains any other symbols, no kmers overlapping with such symbols will be generated. This method - * returns a linear (possibly gapped if non-ACGT symbols are present) representation of the sequence as its kmers - * with corresponding offsets, NOT a lookup index. If a specific kmer occurs on the sequence N times, - * the returned list will have N occurences of this kmer, each paired with one unique location on the sequence. - * Kmers themselves are represented as integer kmer ids here, see #idToSeq() if string (ACGT bases) representation - * of kmers is needed. Empty list if returned if no valid kmers are found on the sequence (i.e. too many non-ACGT bases) - * - * @param K key (kmer) length - * @param seq sequence to translate into kmer/offset representation - * @return list of kmer/offsets - */ - public static List toKeyOffsetList(int K, byte seq[]) { - return toKeyOffsetList(K,seq,0,seq.length); - } - - /** Same as #toKeyOffsetList(int K, byte [] seq) (see docs), except that this method is not static and - * uses key length K associated with the specific instance of the KmerIndex class. - * @param seq - * @return - */ - public List toKeyOffsetList(byte [] seq) { - return toKeyOffsetList(this.K,seq); - } - - /** Returns an ordered sequence of overlapping (1-shift in the ideal case) k-mers of length K found in the subsequence of - * length length of sequence seq starting at position start. All returned kmers - * are fully subsumed by the interval [start, start+length) on the sequence seq (no partial overlaps). - * Each kmer is paired with its offset on the (full length) seq in the returned list. 
- * Note that only k-mers on the forward strand are returned. You need to manually rc the string and - * call toKeyOffsetList() again to get rc k-mers. If sequence contains any other symbols than ACGT, all k-mers - * that would overlap with those symbols will be skipped (not present in the returned list). See also - * #toKeyOffsetList(int K, byte [] seq) which translates the whole sequence seq. - * - * @param K index key (k-mer) length - * @param seq sequence to compute k-mers from - * @param start compute kmers for subsequence [start, start+length) of seq - * @param length compute kmers for subsequence [start, start+length) of seq - * @return a list of pairs (kmer,offset_on_the_seq) for each valid kmer (i.e. kmer that does not overlap with - * non-ACGT bases); if no valid kmers exist, the returned list will be empty. - */ - public static List toKeyOffsetList(int K, byte[] seq, int start, int length) { - - if ( length < K ) throw new StingException("Can not index sequence that is shorter than key length: total seq="+seq.length+"; start="+start+"; length="+length); - - int mask = 0; - if ( length > K ) { - for ( int i = 0; i < K ; i++ ) { - mask <<= 2; - mask |= 0x03; - } - } - - int key = 0; - int i ; - final int final_pos = start+length; // first base *after* the last position we want to index - - ArrayList l = new ArrayList(length-K+1); - - PrimitivePair.Int firstK = toFirstKey(K,seq,start,final_pos); - if ( firstK == null ) { - // ooops, too many non-ACGT bases, we were not able to find a single valid k-mer on the whole sequence! - return l; - } - - l.add(firstK); - - start = firstK.getSecond(); - i = start + K; // i points to the first base after the returned kmer firstK - key = firstK.getFirst(); - - // now let's try recomputing next kmers in an efficient way: we reuse previous kmer and add only the new last base. - // This will break if we encounter a non-ACGT base, in which case we will have to start over. 
- - for ( start++ ; i < final_pos ; i++, start++ ) { - int d = BaseUtils.simpleBaseToBaseIndex(seq[i]); - if ( d == -1 ) { - // ooops, we ran into a bad base; let's jump over it completely and reinitialize the key - // (since all kmers overlapping with the current base are invalid) - firstK = toFirstKey(K,seq,i+1,final_pos); - if ( firstK == null ) break; // no more valid kmers - l.add(firstK); - start = firstK.getSecond(); - i = start+K; // points to the base right after the Kmer we just found - key = firstK.getFirst(); // reset key to the new kmer we just found - } else { - // the base is good, so we can compute our new kmer very efficiently using the old one: - key <<= 2; - key &= mask; - key += d; - l.add(new PrimitivePair.Int(key,start)); - } - } - return l; - } - - /** Non-static version of #toKeyOffsetList(int K, byte [] seq, int start, int length) (see docs), which - * uses key length K associated with this instance of the KmerIndex object. - * @param seq - * @param start - * @param length - * @return - */ - public List toKeyOffsetList(byte[] seq, int start, int length) { - return toKeyOffsetList(this.K,seq,start,length); - } - - - /** Computes index (key) of the first valid kmer in the interval [start,stop) of the sequence seq. Kmer is valid - * if it contains only valid (ACGT) bases. Returns key and actual offset of first such kmer found, or null - * if such kmer does not exist (i.e. if seq does not contain a continuous span of ACGT bases at least K bases long). - * @param K - * @param seq - * @param start - * @param stop - * @return - */ - private static PrimitivePair.Int toFirstKey(int K, byte[] seq, int start, int stop) { - int d = -1; - int key = 0 ; - while ( d == -1 && start < stop - K + 1) { - key = 0; - for ( int i = start ; i < start+K; i++ ) { - key <<= 2; - d = BaseUtils.simpleBaseToBaseIndex(seq[i]); - if ( d == -1) { - // ooops, non-ACGT base found, abort and start over. 
Next kmer that - // have a chance to be valid (contain only ACGT bases) can start only after the current position: - start = i+1; - break; - } - key += d; - } - } // got the first key - - if ( d != -1 ) return new PrimitivePair.Int(key,start); - else return null; - } - - /** Creates an empty kmer index table with specified key length - * - * @param K - */ - public KmerIndex(final int K) { - if ( K > 16 ) throw new StingException("Lookup keys longer than 16 bases are currently not supported"); - if ( K % 2 == 0 ) throw new StingException("Even keys require additional processing of palindromes, currently not supported. Please use odd key."); - this.K = K; - - mask = 0; - for ( int i = 0; i < K; i++ ) { - mask <<= 2; - mask |= 0x03; - } // got the first key - - lookup = new HashMap>(); - } - - /** Builds kmer index table with key length K for the sequence seq. - * - * @param K - * @param seq - */ - public KmerIndex(final int K, final byte[] seq) { - this(K); - - if ( seq.length < K ) throw new StingException("Sequence is shorter than requested lookup index key length"); - - addToIndex(toKeyOffsetList(K,seq,0,seq.length)); - } - - public void setDebug(boolean d) { this.debug = d; } - - /** Clears current lookup index table completely (but preserves the key length previously set). - * - */ - public void clear() { lookup.clear(); } - - /** Builds complete index for the sequence seq. This method can be used only when lookup table is - * empty (i.e. use #clear() first), otherwise an exception will be thrown. - * @param seq - */ - public void index(final byte[] seq) { - if ( ! lookup.isEmpty() ) { - throw new StingException("Can not index new sequence: lookup table is already non-empty"); - } - addToIndex(toKeyOffsetList(K,seq,0,seq.length)); - } - - /** - * Updates existing index. It is assumed that the sequence that was already indexed by this KmerIndex object is - * the exact subsequence of length old_length of the new sequence seq, starting at - * position old_start. 
No checks are performed, so it is the responsibility of the caller to ensure - * that this is indeed the case, otherwise the index will be inconsistent. Since the old sequence is a part - * of the new one, this method will keep all the already computed kmers (and update their offsets as needed), - * and compute and add kmers/offsets for all the novel bases added to the sequence seq compared - * to the old, already indexed subsequnce. If old_length is less than - * K (i.e. old sequence could not be and was not indexed at all), the new sequence seq will - * be fully indexed from start to end. - * @param seq - * @param old_start already indexed subsequence starts at this position in seq - * @param old_length length of the already indexed subsequence - */ - public void updateIndex(final byte[] seq, final int old_start, final int old_length) { - - if ( old_length < K ) { - if ( ! lookup.isEmpty()) - throw new StingException("It is claimed that old indexed sequence is shorter than K (i.e. it could not be indexed), but index is non empty"); - addToIndex( toKeyOffsetList(K,seq,0,seq.length)); - return; - } - - if ( old_start > 0 ) { - // update positions of previously indexed k-mers: - for ( Map.Entry> e : lookup.entrySet() ) { - List l = e.getValue(); - for ( int i = 0 ; i < l.size(); i++ ) l.set(i,l.get(i)+old_start); - } - // take care of additional k-mers appearing *before* the already indexed subsequence: - // if already indexed subsequence starts at 'start', the first k-mer from that sequence - // ends at start+K-1 (inclusive) and it is obviously already indexed. So the last k-mer we want to index now ends at - // start+K-2 (inclusive), the length of [0,start+K-2] interval that we need to index is - // start+K-1. - addToIndex( toKeyOffsetList(K,seq,0,old_start+K-1) ); - } - - // the last k-mer we already indexed ends at start+length-1 (inclusive); so it starts at start+length-1-(K-1)=start+length-K. 
- // Hence, the first k-mer that is not indexed yet starts at start+length-K+1. The length of the subsequence that - // we need to index, [start+length-K+1,seq.length) is seq.length - start - length +K - 1 - - int pos = old_start+old_length-K+1; - addToIndex( toKeyOffsetList(K,seq,pos,seq.length-pos) ); - - - } - - /** Convenience shortcut: takes the list of keys/offsets and pushes offsets into the lookup index for the keys that - * do exist already, or first creates the new entry and then pushes the offset for keys that are novel. This method - * is quiet: if keys is null or an empty list, it does nothing. - * @param keys - */ - private void addToIndex(final List keys ) { - if ( keys == null ) return; - for ( PrimitivePair.Int key: keys ) { - List l = lookup.get(key.getFirst()); - if ( l == null ) { - l = new ArrayList(); - lookup.put(key.getFirst(),l); - } - l.add(key.getSecond()); - } - - } - - /** - * Converts kmer (integer key) of length K into its sequence representation. Returns a sequence (over ACGT alphabet) - * of length K that corresponds to the specified key. - * @param K - * @param kmer - * @return - */ - public static byte [] idToSeq(int K, int kmer) { - byte [] seq = new byte[K]; - for ( int i = K-1; i >=0 ; i-- ) { - seq[i] = BaseUtils.baseIndexToSimpleBase(kmer & 0x3); - kmer >>= 2; - } - return seq; - } - - /** Returns all offsets for the specified kmer (key) on the sequence indexed by this KmerIndex object. Returns - * null if specified kmer is not present on the indexed sequence. 
- * @param key - * @return - */ - public List getOffsets(int key) { return lookup.get(key); } -// public List getOffsets(byte[] seq) { -// if ( seq.length != K ) throw new StingException("Can not perform direct lookup of a sequence with length different from key size"); -// -// return getOffsets( toKey(seq) ) ; -// } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/ReadPair.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/ReadPair.java deleted file mode 100644 index ce5dea0ec..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/ReadPair.java +++ /dev/null @@ -1,401 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.utils; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.CigarElement; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.exceptions.StingException; - -/** - * Created by IntelliJ IDEA. -* User: asivache -* Date: Aug 6, 2010 -* Time: 6:18:01 PM -* To change this template use File | Settings | File Templates. -*/ -public class ReadPair { - - public enum PairType { - UNKNOWN, - BOTH_UNMAPPED, - ONE_UNMAPPED, - PROPER, - LEFT, - RIGHT, - OUTER, - INTER - }; - - - private SAMRecord end1 = null; - private SAMRecord end2 = null; - private PairType pType = PairType.UNKNOWN; - private int leftStart = -1; - private int rightStart = -1; - private SAMRecord leftRead = null; - private SAMRecord rightRead = null; - - - - /** Creates an empty read pair object */ - public ReadPair() {} - - /** Creates a read pair objects initialized with the specified read */ - public ReadPair(SAMRecord read) { - addRead(read); - } - - /** Returns name of the paired read (it is assumed that both individual reads in the pair share same name). - * - * @return - */ - public String getName() { return ( end1 != null ? end1.getReadName() : (end2 != null ? end2.getReadName() : null) ); } - - /** Returns true if both ends are recorded in this read pair object. Note that because SAM records carry - * mate information, a pair can be (partially) initialized from one end. This method verifies that this is not the case - * and both records are actually present. - * @return - */ - public boolean hasBothEnds() { return end1 != null && end2 != null ; } - - /** Returns true if this pair object was initialized with at least one end. Since SAM records carry mate information, - * it is sometimes sufficient to have only one read (fragment end) actually recorded in the pair object, at which - * point some useful information can be retrieved for the pair already. 
- * @return - */ - public boolean hasAnyData() { return end1 != null || end2 != null ; } - - /** Returns true if both ends in the pair are mapped. The pair object must be at least partially initialized (i.e. - * it has to hold a reference to at least one end of the pair), otherwise an exception will be thrown. - * @return - */ - public boolean bothEndsMapped() { - if ( pType == PairType.UNKNOWN ) throw new StingException("ReadPair object was not initialized yet, method can not be applied"); - - if ( pType == PairType.BOTH_UNMAPPED || pType == PairType.ONE_UNMAPPED ) return false; - return true; - } - - /** Returns true if both ends in the pair are mapped uniquely. This method requires both ends being already registered - * in this pair object (i.e. hasBothEnds() is true), otherwise an exception will be thrown. - * @return - */ - public boolean bothEndsUniquelyMapped() { - if ( ! hasBothEnds() ) throw new StingException("Can not determine if both ends are uniquely mapped until both ends are recorded"); - return bothEndsMapped() && end1.getMappingQuality() > 0 && end2.getMappingQuality() > 0; - } - - /** Returns true if this pair is in proper orientation, i.e. ---> <--- on the same contig */ - public boolean isProper() { return pType == PairType.PROPER; } - - /* Returns true if this pair is in outer orientation, i.e. <--- ---> on the same chromosome */ - public boolean isOuter() { return pType == PairType.OUTER; } - - /** Returns left (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map - * onto the same contig, otherwise an exception will be thrown. - * @return - */ - public SAMRecord getLeftRead() { - if ( ! bothEndsMapped() || pType == PairType.INTER ) - throw new StingException("Left read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName()); - if ( leftRead == null ) - throw new StingException("Left read is not recorded. Maybe we have not seen it yet? 
Pair: "+getName()); - return leftRead; - } - - /** Returns right (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map - * onto the same contig, otherwise an exception will be thrown. - * @return - */ - public SAMRecord getRightRead() { - if ( ! bothEndsMapped() || pType == PairType.INTER ) - throw new StingException("Right read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName()); - if ( rightRead == null ) - throw new StingException("Right read is not recorded. Maybe we have not seen it yet? Pair: "+getName()); - return rightRead; - } - - public SAMRecord getEnd1() { return end1; } - public SAMRecord getEnd2() { return end2; } - - public PairType getPairType() { return pType ; } - - public void addRead(SAMRecord r) { - if ( ! r.getReadPairedFlag() ) throw new StingException("Read "+r.getReadName() +" is unpaired"); - if ( r.getFirstOfPairFlag() ) { - if ( end1 != null ) throw new StingException("Read "+r.getReadName()+" is first of pair and the pair already has first read recorded"); - end1 = r; - if ( end2 != null && ! end1.getReadName().equals(end2.getReadName()) ) - throw new StingException("The pair already has read "+end2.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" ); - } else { - if ( r.getSecondOfPairFlag() ) { - if ( end2 != null ) throw new StingException("Read "+r.getReadName()+" is second of pair and the pair already has second read recorded"); - end2 = r; - if ( end1 != null && ! end1.getReadName().equals(end2.getReadName()) ) - throw new StingException("The pair already has read "+end1.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" ); - } else { - throw new StingException("The read "+r.getReadName()+" is marked as paired, but the first/second of pair flag is not set"); - } - } - setPairInfo(r); - } - - /** If pair type has not been set yet, then sets it to t. 
Otherwise (pair type already set), - * just checks if the pair type is t. If it is, the method returns quietly; if it is not (inconsistency detected), - * throws an exception. - * - */ - private void setCheckPairType(PairType t) { - if ( pType != PairType.UNKNOWN ) { - if ( pType != t ) - throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information"); - } else pType = t; - } - - private void setCheckLeftStart(int pos) { - if ( leftStart >= 0 ) { - if ( leftStart != pos ) - throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information"); - } else leftStart = pos; - } - - private void setCheckRightStart(int pos) { - if ( rightStart >= 0 ) { - if ( rightStart != pos ) - throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information"); - } else rightStart = pos; - } - - private void setPairInfo(SAMRecord read) { - - setCheckPairType(getPairType(read)); - - // there is nothing left to do unless both ends are mapped onto the same contig: - if ( pType == PairType.INTER ) return; - - if ( pType == PairType.ONE_UNMAPPED ) { - // set putative left or right read depending on the orientation of the only mapped mate - if ( ! AlignmentUtils.isReadUnmapped(read ) ) { - // we can set left/right read only if it is the current read that is mapped; if we have the - // unmapped mate, skip and wait for the mapped read to come! 
- if ( read.getReadNegativeStrandFlag() ) { - setCheckRightStart(read.getAlignmentStart()); - if ( rightRead != null ) throw new StingException("Right read was already set for the pair"); - rightRead = read; - } else { - setCheckLeftStart(read.getAlignmentStart()); - if ( leftRead != null ) throw new StingException("Left read was already set for the pair"); - leftRead = read; - } - } - return; - } - - // we are here if both ends are mapped and they map onto the same contig - if ( read.getAlignmentStart() < read.getMateAlignmentStart() ) { //left/right = read/mate - - setCheckLeftStart(read.getAlignmentStart()); - setCheckRightStart(read.getMateAlignmentStart()); - - if ( leftRead != null ) throw new StingException("Left read was already set for the pair"); - leftRead = read; - } else { - // left/right = mate/read - - setCheckLeftStart(read.getMateAlignmentStart()); - setCheckRightStart(read.getAlignmentStart()); - - if ( rightRead != null ) throw new StingException("Right read was already set for the pair"); - rightRead = read; - } - } - - /** Returns pair type that describes this read and its mate. The alignment information for both the read itself - * and its mate is taken from the read's sam record passed as the argument, so the mate information is expected to be - * correctly set! - * @param read - * @return - */ - public static PairType getPairType(SAMRecord read) { - - if ( AlignmentUtils.isReadUnmapped(read) ) { - if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.BOTH_UNMAPPED; - else return PairType.ONE_UNMAPPED; - } - - return getWouldBePairType(read,read.getReferenceIndex(),read.getAlignmentStart(),read.getReadNegativeStrandFlag()); - } - - /** Returns pair type that would describe this read and its mate, if this read mapped onto refId:start in orientation - * given by rc (forward is rc=false, reverse is rc=true). 
The read's alignment information (if any, - * unmapped reads are allowed) present in the SAM record is completely ignored by this method, - * only mate's information is used. - * @param read - * @param refId - * @param start - * @param rc - * @return - */ - public static PairType getWouldBePairType(SAMRecord read, int refId, int start, boolean rc) { - - - if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.ONE_UNMAPPED ; - - // both read and mate are mapped: - - if ( refId != read.getMateReferenceIndex() ) return PairType.INTER; - - // both read and its mate map onto the same chromosome - - if ( start < read.getMateAlignmentStart() ) { //left/right = read/mate - - if ( rc ) { - if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT; - else return PairType.OUTER; - } else { - if ( read.getMateNegativeStrandFlag() ) return PairType.PROPER; - else return PairType.RIGHT; - } - } else { - // left/right = mate/read - - if ( rc ) { - if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT; - else return PairType.PROPER; - } else { - if ( read.getMateNegativeStrandFlag() ) return PairType.OUTER; - else return PairType.RIGHT; - } - } - } - - public int getLeftStart() { - if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied"); - return leftStart; - } - - public int getRightStart() { - if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied"); - return rightStart; - } - - public int getFragmentSize() { - if ( ! hasBothEnds() ) throw new StingException("Can not determine fragment size: pair object does not have both ends yet"); - if ( ! 
bothEndsMapped() ) throw new StingException("Can not determine fragment size: both ends must be mapped"); - if ( pType != PairType.PROPER ) throw new StingException("The pais is not in proper orientation, can not determine fragment size"); - - return getFragmentSize(leftRead,rightRead); - } - - /** Given a read (that must belong to this pair), returns the other end in the pair if it is already - * recorded, or null otherwise. - * @param read - * @return - */ - public SAMRecord getOtherEnd(SAMRecord read) { - if ( read.getFirstOfPairFlag() ) return end2; - else { - if ( read.getSecondOfPairFlag() ) return end1; - } - return null; - } - - public static int getFragmentSize(SAMRecord left, SAMRecord right) { - - if ( left == null || right == null || - AlignmentUtils.isReadUnmapped(left) || AlignmentUtils.isReadUnmapped(right) ) { - throw new StingException("No read (null) or unmapped read provided: fragment size is not defined"); - } - if ( left.getReferenceIndex() != right.getReferenceIndex() ) { - throw new StingException("Left/right reads map onto different contigs: fragment size is not defined"); - } - - int fragment_length = left.getReadLength(); // fragment is at least as long as the left read, duh! - int leftEnd = left.getAlignmentEnd(); - int rightStart = right.getAlignmentStart(); - - if ( rightStart > leftEnd ) { - // if reads are not overlapping, fragment length is lengths of both reads plus the distance (gap) between - // the reads. Note that if the sequence between the reads happens to have insirtions or deletions, - // our estimation of the actual distance between the reads (on the fragment) is incorrect, but we - // can not do better given just those reads. 
This estimation is, in particular, incorrect - // for left reads ending with 'I' and/or right reads starting with 'I' - // - // left right - // -------->...gap...<-------- fragment = left+gap+right - - return left.getReadLength() + right.getReadLength() + (rightStart - leftEnd-1); - } - - // if we are here, the reads do overlap; fragment length is lengths of the two reads less the overlap. - // in this case we can compute the actual overlap between the reads (on the fragment) taking into - // account indels, if any - // - // left **** right - // ------------> ****=overlap; fragment = left+right - overlap - // <-------------- - // - // with deletion: - // - // left ** ** right - // -----------ddd-> ****=overlap; fragment = left+right - overlap - // <-ddd------------- note that overlap != leftEnd - rightStart+1 - // instead, overlap = leftEnd-rightStart+1- length(D) - // with insertion: - // - // left ******* right ******* = overlap; fragment = left+right - overlap - // -------------iii-> note that overlap != leftEnd - rightStart +1 - // <-iii-------------- instead, overlap = leftEnd - rightStart +1 + length(I) - // (since 'i' bases are NOT on the ref) - - int posOnRef = rightStart; -// int posOnRightRead = 0; - - int overlap = leftEnd - rightStart + 1 ; - - for(CigarElement ce : left.getCigar().getCigarElements() ) { - switch(ce.getOperator()) { - case S: - case H: -// posOnRightRead+=ce.getLength(); - break; - case I: - overlap += ce.getLength(); - break; - case D: - case N: - overlap -= ce.getLength(); - case M: - posOnRef += ce.getLength(); - break; - default: - } - if ( posOnRef > leftEnd ) break; // we need to examine only overlapping part of the reads - } - return left.getReadLength() + right.getReadLength() - overlap; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AffectedConsistencyWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AffectedConsistencyWalker.java deleted file mode 100755 index 
4100b500d..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AffectedConsistencyWalker.java +++ /dev/null @@ -1,307 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.*; - -/** - * Given a VCF file and one or more scenarios for affected individuals, calculates the probability that a given site's genotypes - * are consistent with the expected pattern for a given disease model. 
- */ -@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class)) -public class AffectedConsistencyWalker extends RodWalker { - public enum DiseaseModel { DOMINANT, RECESSIVE } - - @Output - public PrintStream out; - - @Argument(fullName="affected", shortName="A", doc="A scenario file (or files) for affected individuals. Scenarios are specified with an identifier and a comma-separated list of samples (e.g. Pedigree_1 sample1,sample2,sample3). Each line is another scenario.", required=true) - public String[] AFFECTED_SAMPLE_SCENARIOS; - - @Argument(fullName="diseaseModel", shortName="DM", doc="The disease model (DOMINANT or RECESSIVE)", required=true) - public DiseaseModel DISEASE_MODEL; - - @Argument(fullName="verbose", shortName="V", doc="If specified, enable verbose mode with a lot of output useful for debugging", required=false) - public PrintStream VERBOSE_WRITER = null; - - public Map> sampleScenarios; - public GATKReport consistencyReport; - public Set availableSamples; - - private Map> loadAffectedSampleScenarios() { - // Load all the specified sample scenarios specified in one or more files - ArrayList scenarioStrings = new ArrayList(); - - for (String affectedSampleScenario : AFFECTED_SAMPLE_SCENARIOS) { - File affectedSampleScenarioFile = new File(affectedSampleScenario); - - try { - XReadLines lineReader = new XReadLines(affectedSampleScenarioFile); - - for (String line : lineReader) { - // Ignore commented-out lines - if (!line.contains("#")) { - scenarioStrings.add(line); - } - } - } catch (FileNotFoundException e) { - throw new UserException(String.format("The scenario file '%s' was not found", affectedSampleScenarioFile.getAbsolutePath())); - } - } - - // Parse all the sample scenario strings (comma- or white-space-separated sample lists) - Map> scenarios = new HashMap>(); - - for (String scenarioString : scenarioStrings) { - String[] pieces = scenarioString.split("[\\s]+"); - - if (pieces.length != 2) { - throw new 
UserException( - String.format("The scenario line '%s' could not be understood. Please make sure that your " + - "scenario file has only two columns: the first being an arbitrary scenario id " + - "(e.g. 'Pedigree_1') and the second being a comma-separated list of samples " + - "(e.g. 'sample1,sample2,sample3')", - scenarioString - ) - ); - } - - String scenarioId = pieces[0]; - - String[] sampleNames = pieces[1].split(","); - - Set samples = new HashSet(); - for (String sample : sampleNames) { - if (!availableSamples.contains(sample)) { - throw new UserException( - String.format("The sample '%s' was not found in the ROD bound as the 'variant' track " + - "(i.e. the file that was supplied via '-B:variant,VCF /path/to/my.vcf'). " + - "Please make sure all samples specified for processing are present in " + - "your VCF.", - sample) - ); - } else { - samples.add(sample); - } - } - - scenarios.put(scenarioId, samples); - } - - if (scenarios.size() == 0) { - throw new UserException("There were no scenarios specified. 
Please specify at least one set of affected samples."); - } - - return scenarios; - } - - public void initialize() { - // Figure out what samples I can possibly have (from the bound VCF file) - ArrayList rodNames = new ArrayList(); - rodNames.add("variant"); - - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - availableSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - - // Load the scenarios to consider - sampleScenarios = loadAffectedSampleScenarios(); - - // Prepare the output report - consistencyReport = new GATKReport(); - consistencyReport.addTable("AffectedConsistency", "Table of results indicating if the observed genotypes matched the expected genotypes"); - - GATKReportTable table = consistencyReport.getTable("AffectedConsistency"); - table.addPrimaryKey("locus_and_scenario", false); - table.addColumn("chr", "unknown"); - table.addColumn("start", 0); - table.addColumn("scenario", "unknown"); - table.addColumn("P_of_C_given_DM_is_true", 0.0); - table.addColumn("P_of_C_given_DM_is_false", 0.0); - table.addColumn("OR_DM_is_true_vs_DM_is_false", 0.0); - - for ( String sample : availableSamples ) { - table.addColumn(sample, "unknown"); - } - - if (VERBOSE_WRITER != null) { - VERBOSE_WRITER.println("This is a test of the verbose writer"); - } - - System.exit(0); - } - - private VariantContext getExpectedGenotypeConfiguration(Set affectedSamples, VariantContext obs) { - List homRefAlleles = new ArrayList(); - homRefAlleles.add(obs.getReference()); - homRefAlleles.add(obs.getReference()); - - List hetAlleles = new ArrayList(); - hetAlleles.add(obs.getReference()); - hetAlleles.add(obs.getAlternateAllele(0)); - - List homVarAlleles = new ArrayList(); - homVarAlleles.add(obs.getAlternateAllele(0)); - homVarAlleles.add(obs.getAlternateAllele(0)); - - Collection expectedGenotypes = new ArrayList(); - for ( String sample : obs.getSampleNames() ) { - Genotype expectedGenotype = new 
Genotype(sample, homRefAlleles); - if (affectedSamples.contains(sample)) { - expectedGenotype = (DISEASE_MODEL == DiseaseModel.DOMINANT) ? new Genotype(sample, hetAlleles) : new Genotype(sample, homVarAlleles); - } - - expectedGenotypes.add(expectedGenotype); - } - - return new VariantContext("expected", obs.getChr(), obs.getStart(), obs.getEnd(), obs.getAlleles(), expectedGenotypes); - } - - private double getLogLikelihoodOfDiseaseModelHypothesis(VariantContext obs, VariantContext exp, boolean diseaseModelIsSupported) { - return getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, diseaseModelIsSupported, 0, 0.0); - } - - private double getLogLikelihoodOfDiseaseModelHypothesis(VariantContext obs, VariantContext exp, boolean diseaseModelIsSupported, int sampleIndex, double logLikelihoodSoFar) { - if (sampleIndex < exp.getNSamples()) { - Genotype expGenotype = exp.getGenotype(sampleIndex); - Genotype obsGenotype = obs.getGenotype(sampleIndex); - - if (obsGenotype.hasLikelihoods()) { - double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(obsGenotype.getLikelihoods().getAsVector()); - boolean[] expectedGenotypes = { expGenotype.isHomRef(), expGenotype.isHet(), expGenotype.isHomVar() }; - - for (int i = 0; i < 3; i++) { - if (expectedGenotypes[i] == diseaseModelIsSupported) { - return getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, diseaseModelIsSupported, sampleIndex + 1, logLikelihoodSoFar + Math.log10(normalizedLikelihoods[i])); - } - } - } else { - return getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, diseaseModelIsSupported, sampleIndex + 1, logLikelihoodSoFar); - } - } - - return logLikelihoodSoFar; - } - - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker != null) { - Collection vcs = tracker.getAllVariantContexts(ref, null, ref.getLocus(), true, true); - - if (vcs.size() == 1) { - VariantContext obs = vcs.iterator().next(); - - for (String scenarioId : 
sampleScenarios.keySet()) { - Set affectedSamples = sampleScenarios.get(scenarioId); - - VariantContext exp = getExpectedGenotypeConfiguration(affectedSamples, obs); - - /* - GATKReport report = new GATKReport(); - - String reportName = String.format("GenotypeTable_%s_%s_%s", scenarioId, ref.getLocus().getContig(), ref.getLocus().getStart()); - String reportDesc = String.format("Info for scenario %s at locus %s", scenarioId, ref.getLocus()); - report.addTable(reportName, reportDesc); - - GATKReportTable table = report.getTable(reportName); - - table.addPrimaryKey("sample_pk", false); - table.addColumn("table", "unknown"); - table.addColumn("sample", "unknown"); - table.addColumn("affected", false); - table.addColumn("homref_prob", "unknown"); - table.addColumn("het_prob", "unknown"); - table.addColumn("homvar_prob", "unknown"); - table.addColumn("observed_genotype", "unknown"); - table.addColumn("expected_genotype", "unknown"); - - for (String sample : obs.getSampleNames()) { - double[] normalizedLikelihoods = {0.0, 0.0, 0.0}; - if (obs.getGenotype(sample).hasLikelihoods()) { - normalizedLikelihoods = MathUtils.normalizeFromLog10(obs.getGenotype(sample).getLikelihoods().getAsVector()); - } - - table.set(sample, "table", reportName); - table.set(sample, "sample", sample); - table.set(sample, "affected", affectedSamples.contains(sample)); - table.set(sample, "homref_prob", normalizedLikelihoods[0]); - table.set(sample, "het_prob", normalizedLikelihoods[1]); - table.set(sample, "homvar_prob", normalizedLikelihoods[2]); - table.set(sample, "observed_genotype", obs.getGenotype(sample).getGenotypeString()); - table.set(sample, "expected_genotype", exp.getGenotype(sample).getGenotypeString()); - } - - report.print(out); - */ - - double logLikelihoodThatDiseaseModelIsSupported = getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, true); - double logLikelihoodThatDiseaseModelIsNotSupported = getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, false); - double 
logOddsRatioThatDiseaseModelIsSupported = logLikelihoodThatDiseaseModelIsSupported / logLikelihoodThatDiseaseModelIsNotSupported; - - String key = String.format("%s_%s_%s", ref.getLocus().getContig(), ref.getLocus().getStart(), scenarioId); - - consistencyReport.getTable("AffectedConsistency").set(key, "scenario", scenarioId); - consistencyReport.getTable("AffectedConsistency").set(key, "chr", ref.getLocus().getContig()); - consistencyReport.getTable("AffectedConsistency").set(key, "start", ref.getLocus().getStart()); - consistencyReport.getTable("AffectedConsistency").set(key, "P_of_C_given_DM_is_true", logLikelihoodThatDiseaseModelIsSupported); - consistencyReport.getTable("AffectedConsistency").set(key, "P_of_C_given_DM_is_false", logLikelihoodThatDiseaseModelIsNotSupported); - consistencyReport.getTable("AffectedConsistency").set(key, "OR_DM_is_true_vs_DM_is_false", logOddsRatioThatDiseaseModelIsSupported); - - for ( String sample : availableSamples ) { - String obsAndExpectedGenotypes = String.format("%s;%s", obs.getGenotype(sample).getGenotypeString(), exp.getGenotype(sample).getGenotypeString()); - consistencyReport.getTable("AffectedConsistency").set(key, sample, obsAndExpectedGenotypes); - } - } - } - } - - return null; - } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - @Override - public Integer reduceInit() { - return null; - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. 
- */ - @Override - public Integer reduce(Integer value, Integer sum) { - return null; - } - - public void onTraversalDone(Integer result) { - consistencyReport.print(out); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AlignedReadsHistoWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AlignedReadsHistoWalker.java deleted file mode 100755 index 0d4d5ab62..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AlignedReadsHistoWalker.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Output; - -import java.io.PrintStream; - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 3:22:14 PM - * To change this template use File | Settings | File Templates. - */ -@WalkerName("Aligned_Reads_Histogram") -public class AlignedReadsHistoWalker extends ReadWalker { - @Output - PrintStream out; - - long[] alignCounts = new long[51]; - - public void initialize() { - for ( int i = 0; i < alignCounts.length; i++ ) { - alignCounts[i] = 0; - } - } - - // Do we actually want to operate on the context? 
- public boolean filter(byte[] ref, SAMRecord read) { - // we only want aligned reads - return !read.getReadUnmappedFlag(); - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - //System.out.println(read.getAttribute("NM")); - int editDist = Integer.parseInt(read.getAttribute("NM").toString()); - if (editDist <= 50) - alignCounts[editDist]++; - return 1; - } - - // Given result of map function - public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - public void onTraversalDone(Integer result) { - int curTotal = 0; - for ( int i = 0; i < alignCounts.length; i++ ) { - curTotal += alignCounts[i]; - out.printf("%3d %10d%n", i, curTotal); - } - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AlleleBalanceHistogramWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AlleleBalanceHistogramWalker.java deleted file mode 100644 index 509d65441..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AlleleBalanceHistogramWalker.java +++ /dev/null @@ -1,119 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import 
org.broadinstitute.sting.commandline.Output; - -import java.util.*; -import java.io.PrintStream; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Jan 26, 2010 - * Time: 3:25:11 PM - * To change this template use File | Settings | File Templates. - */ -@Requires(value= DataSource.REFERENCE,referenceMetaData = {@RMD(name="variants",type=ReferenceOrderedDatum.class)}) -public class AlleleBalanceHistogramWalker extends LocusWalker, Map>> { - @Output - PrintStream out; - - public Map> reduceInit() { - return new HashMap>(); - } - - public Map> reduce(Map alleleBalances, Map> aggregateBalances ) { - if ( alleleBalances != null ) { - for ( String name : alleleBalances.keySet() ) { - if ( alleleBalances.get(name) != null ) { - if ( aggregateBalances.get(name) != null ) { - aggregateBalances.get(name).add(alleleBalances.get(name)); - } else { - aggregateBalances.put(name,new HashSet( Arrays.asList(alleleBalances.get(name) ) ) ); - } - } - } - } - - return aggregateBalances; - } - - public Map map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - VariantContext vc = tracker.getVariantContext(ref, "variants", EnumSet.of(VariantContext.Type.SNP), context.getLocation(), false); - if ( vc == null || !vc.isBiallelic() ) { - return null; - } - - return getAlleleBalanceBySample(vc,ref,context); - } - - public void onTraversalDone(Map> finalSets) { - for ( String s : finalSets.keySet() ) { - StringBuilder output = new StringBuilder(); - output.append(String.format("%s",s)); - for ( double d : finalSets.get(s) ) { - output.append(String.format("\t%.2f",d)); - } - out.print(String.format("%s%n",output)); - } - } - - private HashMap getAlleleBalanceBySample(VariantContext vc, ReferenceContext ref, AlignmentContext context) { - Map sampleContext = AlignmentContextUtils.splitContextBySampleName(context); - HashMap balances = new HashMap(); - System.out.println("----- "+ref.getLocus()+" -----"); - int returnedBalances = 0; - for ( String sample 
: vc.getSampleNames() ) { - Double balance = getAlleleBalance(ref,sampleContext.get(sample),(char)vc.getAlternateAllele(0).getBases()[0]); - balances.put(sample, balance); - if ( balance != null ) { - returnedBalances++; - System.out.println(sample+"\t"+getCoverage(sampleContext.get(sample))); - } - } - - return balances; - } - - private long getCoverage(AlignmentContext context) { - return context.size(); - } - - private Double getAlleleBalance(ReferenceContext ref, AlignmentContext alicon, char snpBase) { - if ( alicon == null ) { - //System.out.println("Stratified context was null"); - return null; - } - - int refBases = 0; - int altBases = 0; - - for ( PileupElement e : alicon.getBasePileup() ) { - if ( BaseUtils.basesAreEqual( e.getBase(), ref.getBase() ) ) { - refBases++; - } else if ( BaseUtils.basesAreEqual(e.getBase(), (byte) snpBase ) ) { - altBases++; - } - } - - if ( refBases > 0 || altBases > 0) { - return ( ( double ) altBases ) / ( ( double ) altBases + ( double ) refBases ); - } else { - System.out.println("No ref or alt bases in pileup"); - return null; - } - } - - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AnnotateTruthROD.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AnnotateTruthROD.java deleted file mode 100755 index 3fac966ea..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AnnotateTruthROD.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and 
this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; - -/** - * Changes annotation in the truth dataset from the filter field to the INFO field. 
- * - * @author carneiro - * @since Mar 15, 2011 - * @help.summary Changes annotation in the truth dataset from the filter field to the INFO field. - */ - -public class AnnotateTruthROD extends RodWalker { - - @Output(doc="File to which validated variants should be written", required=true) - protected VCFWriter vcfWriter = null; - - List rodNames; - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - public void initialize() { - - List rodList = getToolkit().getRodDataSources(); - rodNames = new ArrayList(); - - // Initialize VCF header - Set headerLines = null; - Set samples = null; - for (ReferenceOrderedDataSource rod : rodList) { - rodNames.add(rod.getName()); - Map header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), rod.getName()); - samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); - headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); - } - if (headerLines == null|| samples == null) - throw new UserException.BadInput("You need to provide at least one ROD file to annotate"); - vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - - int linesWritten = 0; - // For some reason RodWalkers get map calls with null trackers - if( tracker == null ) - return linesWritten; - - for (String rod : rodNames) { - VariantContext vc = tracker.getVariantContext(ref, rod, 
null, context.getLocation(), false); - if (!isInsideExtendedIndel(vc, ref)) { - MutableVariantContext mvc = new MutableVariantContext(vc); - mvc.putAttribute("GV", vc.getFilters().contains("TP") ? "T":"F"); - mvc.clearFilters(); - vcfWriter.add(mvc, ref.getBase()); - linesWritten++; - } - - } - return linesWritten; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - public Integer reduceInit() { - return 0; - } - - public Integer reduce( Integer mapValue, Integer reduceSum ) { - return reduceSum + mapValue; - } - - public void onTraversalDone( Integer reduceSum ) { - logger.info(reduceSum + " lines written."); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AnnotationByAlleleFrequencyWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AnnotationByAlleleFrequencyWalker.java deleted file mode 100755 index 40bec19f6..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AnnotationByAlleleFrequencyWalker.java +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; - -import java.util.*; - -public class AnnotationByAlleleFrequencyWalker extends RodWalker { - - /* Get selected annotations from a given VCF File. If these annotations match a site reference ROD, output the annotation value. 
- Usage example: - java -jar dist/GenomeAnalysisTK.jar -R reference.fasta \ - -T AnnotationByAlleleFrequency \ - -A QualByDepth \ - -B eval,VCF,eval.vcf \ - -B HapMap,VCF,ref.vcf - - - */ - ///////////////////////////// - // Command Line Arguments - ///////////////////////////// - - @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) - protected String[] annotationsToUse = {}; - - @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - protected String[] annotationClassesToUse = { }; - - private List requestedInfoAnnotations; - private List requestedGenotypeAnnotations; - - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - public void initialize() { - List annotationClasses = Arrays.asList(annotationClassesToUse); - List annotations = Arrays.asList(annotationsToUse); - AnnotationInterfaceManager.validateAnnotations(annotationClasses, annotations); - requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationClasses, annotations); - requestedGenotypeAnnotations = AnnotationInterfaceManager.createGenotypeAnnotations(annotationClasses, annotations); - } - - private static ArrayList getInstances(List> classes) { - ArrayList objects = new ArrayList(); - for ( Class c : classes ) - objects.add((T)getInstance(c)); - return objects; - } - - private static T getInstance(Class c) { - try { - return c.newInstance(); - } catch (Exception e) { - throw new DynamicClassResolutionException(c, e); - } - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - 
//--------------------------------------------------------------------------------------------------------------- - - - public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - - if( tracker != null ) { - EnumSet vc = EnumSet.of(VariantContext.Type.SNP); - GenomeLoc loc = context.getLocation(); - VariantContext vc_eval; - VariantContext vc_ref; - - - try { - vc_eval = tracker.getVariantContext(ref,"eval", vc, loc, true); - vc_ref = tracker.getVariantContext(ref,"HapMap", vc, loc, true); - } catch (java.util.NoSuchElementException e) { - return 0; - } - - if (vc_ref == null || vc_eval == null) { - return 0; - } - - // Get Allele frequency for reference ROD - double af_ref = Double.valueOf(2*vc_ref.getHomVarCount()+vc_ref.getHetCount()); - af_ref = af_ref / (vc_ref.getChromosomeCount()); - System.out.format("AF_Ref: %5.4f ", af_ref); - - // Get Allele frequency for eval ROD - double af_eval = Double.valueOf(2*vc_eval.getHomVarCount()+vc_eval.getHetCount()); - af_eval = af_eval / (vc_eval.getChromosomeCount()); - System.out.format("AF_Eval: %5.4f ", af_eval); - - -/* - String qq = vc_eval.getAttributeAsString("AF"); - System.out.format("AF_EvalVCF: %s ", qq); - */ - - //go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) - { - - String key = annotationType.getKeyNames().get(0); - String value_str = vc_eval.getAttributeAsString(key); - System.out.format("%s: %s ", key, value_str); - } - System.out.println(); - - } - - return 1; // This value isn't actually used for anything - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - public Integer reduceInit() { - return 0; // Nothing to do here - } - - public Integer reduce( Integer value, Integer sum ) { 
- return 0; // Nothing to do here - } - - public void onTraversalDone( Integer sum ) { - - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessLikelihoodsAtTruth.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessLikelihoodsAtTruth.java deleted file mode 100755 index 57d143127..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessLikelihoodsAtTruth.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.*; - -/** - * Assesses GLs at truth sites. - * Use -B:variant,vcf and -B:truth,vcf - */ -public class AssessLikelihoodsAtTruth extends RodWalker { - - private int[] nonErrors = new int[101]; - private int[] observations = new int[101]; - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - VariantContext variant = tracker.getVariantContext(ref, "variant", null, context.getLocation(), true); - if ( variant == null ) - return 0; - - VariantContext truth = tracker.getVariantContext(ref, "truth", null, context.getLocation(), true); - if ( truth == null ) - return 0; - - for ( Map.Entry GLgenotypeEntry : variant.getGenotypes().entrySet() ) { - Genotype GLgenotype = GLgenotypeEntry.getValue(); - if ( GLgenotype.isNoCall() ) - continue; - - if ( !truth.hasGenotype(GLgenotypeEntry.getKey()) ) - continue; - - Genotype truthGenotype = truth.getGenotype(GLgenotypeEntry.getKey()); - if ( truthGenotype.isNoCall() ) - continue; - - GenotypeLikelihoods GLs = GLgenotype.getLikelihoods(); - if ( GLs == null ) { - logger.warn("There are no GLs at " + context.getLocation()); - continue; - } - - double[] normalizedGLs = MathUtils.normalizeFromLog10(GLs.getAsVector()); - double myGL = GLgenotype.isHomRef() ? normalizedGLs[0] : (GLgenotype.isHet() ? 
normalizedGLs[1] : normalizedGLs[2]); - int roundedGL = (int)Math.round(100.0 * myGL); - - observations[roundedGL]++; - boolean correctGenotype = GLgenotype.getType().equals(truthGenotype.getType()); - if ( correctGenotype ) - nonErrors[roundedGL]++; - } - - return 1; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) { - System.out.println("GL_probability\tone_minus_error_rate\tobservations"); - for (int i = 0; i < 101; i++) { - if ( observations[i] > 0 ) - System.out.println(String.format("%.2f\t%.2f\t%d", (double)i/100.0, (double)nonErrors[i]/(double)observations[i], observations[i])); - } - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessMissingBroadCalls.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessMissingBroadCalls.java deleted file mode 100755 index 4478bd501..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessMissingBroadCalls.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.util.*; - -/** - * Assesses calls missing from the BI set that are made by all other centers. 
- * Use -B:broad,vcf and -B:1kg,vcf - */ -@Reference(window=@Window(start=-50,stop=50)) -@Requires(value={}) -public class AssessMissingBroadCalls extends RodWalker { - - private static final String status_key = "BI_STATUS"; - private static final String qual_key = "BI_QUAL"; - - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter writer = null; - - public void initialize() { - final ArrayList inputNames = new ArrayList(); - inputNames.add("1kg"); - - // setup the header fields - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); - writer.writeHeader(new VCFHeader(hInfo, new HashSet())); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - List kgRods = tracker.getReferenceMetaData("1kg"); - // ignore places where we don't have a variant - if ( kgRods.size() == 0 ) - return 0; - - VariantContext vc = (VariantContext)kgRods.get(0); - Map attrs = new HashMap(vc.getAttributes()); - - List biRods = tracker.getReferenceMetaData("broad"); - if ( biRods.size() == 0 ) - attrs.put(status_key, "NotCalled"); - else { - VariantContext BIvc = (VariantContext)biRods.get(0); - // skip the site if we called it - if ( !BIvc.isFiltered() ) - return 0; - - attrs.put(qual_key, BIvc.getPhredScaledQual()); - - Set filters = BIvc.getFilters(); - StringBuilder sb = new StringBuilder(); - for ( String filter : filters ) { - if ( sb.length() != 0 ) - sb.append("-"); - sb.append(filter); - } - attrs.put(status_key, sb.toString()); - } - - vc = VariantContext.modifyAttributes(vc, attrs); - writer.add(vc, ref.getBase()); - - return 1; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) {} -} diff --git 
a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessSimulatedPerformance.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessSimulatedPerformance.java deleted file mode 100755 index 47bd82d2f..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessSimulatedPerformance.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.variantutils.VariantsToTable; -import org.broadinstitute.sting.utils.Utils; - -import java.io.PrintStream; -import java.util.*; - -/** - * Emits specific fields as dictated by the user from one or more VCF files. - */ -@Requires(value={}) -public class AssessSimulatedPerformance extends RodWalker { - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(fullName="fields", shortName="F", doc="Fields to emit from the calls VCF", required=false) - public String FIELDS = "CHROM,POS,REF,ALT,QUAL,AC,AN,DP,Q,MODE"; - - @Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false) - public int MAX_RECORDS = -1; - int nRecords = 0; - - private List fieldsToTake; - - public void initialize() { - fieldsToTake = Arrays.asList(FIELDS.split(",")); - - for ( String source : Arrays.asList("sim", "called")) { - out.print(source + "." 
+ Utils.join("\t" + source + ".", fieldsToTake)); - out.print("\t"); - } - out.println(); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) { - printVCFields("sim", tracker, ref, context); - printVCFields("called", tracker, ref, context); - out.println(); - return 1; - } else { - if ( nRecords >= MAX_RECORDS ) { - logger.warn("Calling sys exit to leave after " + nRecords + " records"); - System.exit(0); // todo -- what's the recommend way to abort like this? - } - return 0; - } - } - - private void printVCFields(String name, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - VariantContext vc = tracker.getVariantContext(ref, name, null, context.getLocation(), true); - out.print(Utils.join("\t", VariantsToTable.extractFields(vc, fieldsToTake, true))); - out.print("\t"); - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) {} -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessSnpsNearIndels.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessSnpsNearIndels.java deleted file mode 100644 index b7aa1defe..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/AssessSnpsNearIndels.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2010. 
- * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.io.PrintStream; -import java.util.*; - -/** - * Assesses distance of SNP calls to nearest indel call in Exomes. 
- * Use -B:snps,vcf and -B:indels,vcf - */ -public class AssessSnpsNearIndels extends RodWalker { - - private class TiTv { - int TiCount = 0, TvCount = 0; - - public TiTv() {}; - } - - private static final int Bin0to5 = 0; - private static final int Bin6to10 = 1; - private static final int Bin11to15 = 2; - private static final int Bin16to20 = 3; - private static final int Bin21to25 = 4; - private static final int Bin26to30 = 5; - private static final int BinMoreThan30 = 6; - - private GenomeLoc previousIndel = null; - private ArrayList snpQueue = new ArrayList(); - private TiTv[] counts = new TiTv[7]; - private GenomeLocParser GLparser = null; - - @Output(doc="File to which results should be written",required=true) - protected PrintStream out = null; - - public void initialize() { - GLparser = getToolkit().getGenomeLocParser(); - - for (int i = 0; i < 7; i++) - counts[i] = new TiTv(); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - VariantContext snp = tracker.getVariantContext(ref, "snps", null, ref.getLocus(), true); - VariantContext indel = tracker.getVariantContext(ref, "indels", null, ref.getLocus(), true); - - // first add the snp if available - if ( snp != null && !snp.isFiltered() ) { - - // flush the queue on a new contig - if ( !snpQueue.isEmpty() && !snpQueue.get(0).getChr().equals(snp.getChr()) ) { - for ( VariantContext vc : snpQueue ) - calculateDistance(vc, previousIndel, null); - snpQueue.clear(); - } - - snpQueue.add(snp); - } - - // then look for the indel - if ( indel != null && !indel.isFiltered() ) { - - GenomeLoc loc = GLparser.createGenomeLoc(indel.getChr(), indel.getStart(), indel.getEnd()); - boolean sameContig = !snpQueue.isEmpty() && snpQueue.get(0).getChr().equals(indel.getChr()); - - // flush the queue - for ( VariantContext vc : snpQueue ) - calculateDistance(vc, previousIndel, sameContig ? 
loc : null); - snpQueue.clear(); - - previousIndel = loc; - } - - return 1; - } - - private void calculateDistance(VariantContext snp, GenomeLoc previousIndel, GenomeLoc nextIndel) { - - GenomeLoc loc = GLparser.createGenomeLoc(snp.getChr(), snp.getStart(), snp.getEnd()); - - int previousDistance = -1, nextDistance = -1; - - if ( previousIndel != null ) { - // watch out for spanning deletions - if ( previousIndel.getStop() > snp.getStart() ) - previousDistance = 0; - else - previousDistance = snp.getStart() - previousIndel.getStop(); - } - - if ( nextIndel != null ) - nextDistance = nextIndel.getStart() - loc.getStart(); - - if ( previousDistance == -1 && nextDistance == -1 ) - return; - - int distance = -1; - if ( previousDistance == -1 ) - distance = nextDistance; - else if ( nextDistance == -1 ) - distance = previousDistance; - else - distance = Math.min(previousDistance, nextDistance); - - TiTv obj; - if ( distance < 0 ) - throw new IllegalStateException("Found a negative distance at " + loc); - else if ( distance < 6 ) - obj = counts[Bin0to5]; - else if ( distance < 11 ) - obj = counts[Bin6to10]; - else if ( distance < 16 ) - obj = counts[Bin11to15]; - else if ( distance < 21 ) - obj = counts[Bin16to20]; - else if ( distance < 26 ) - obj = counts[Bin21to25]; - else if ( distance < 31 ) - obj = counts[Bin26to30]; - else - obj = counts[BinMoreThan30]; - - if ( BaseUtils.isTransition(snp.getReference().getBases()[0], snp.getAlternateAllele(0).getBases()[0]) ) - obj.TiCount++; - else - obj.TvCount++; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) { - // flush the queue - for ( VariantContext vc : snpQueue ) - calculateDistance(vc, previousIndel, null); - - out.println("Bin\tnumTi\tnumTv\tTi/Tv"); - printLine(counts[Bin0to5], "0to5"); - printLine(counts[Bin6to10], "6to10"); - printLine(counts[Bin11to15], "11to15"); - 
printLine(counts[Bin16to20], "16to20"); - printLine(counts[Bin21to25], "21to25"); - printLine(counts[Bin26to30], "26to30"); - printLine(counts[BinMoreThan30], ">30"); - } - - private void printLine(TiTv obj, String s) { - out.println(String.format("%s\t%d\t%d\t%.2f", s, obj.TiCount, obj.TvCount, ((double)obj.TiCount/(double)obj.TvCount))); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/BeagleOutputByDepthWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/BeagleOutputByDepthWalker.java deleted file mode 100755 index 64e2633da..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/BeagleOutputByDepthWalker.java +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.io.PrintStream; -import java.util.*; - -/** - * Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input VCF file - * @help.summary Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input VCF file - */ -public class BeagleOutputByDepthWalker extends RodWalker { - - - public static final String POSTBEAGLE_EVAL_ROD_NAME = "postbeaglevcf"; - public static final String PREBEAGLE_EVAL_ROD_NAME = "prebeaglevcf"; - public static final String INPUT_HAPMAP_ROD_NAME = "hapmap"; - public static final String INPUT_COMP_ROD_NAME = "comp"; - - @Output - protected PrintStream outputWriter = null; - - - public void initialize() { - - } - - - public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - - if( tracker != null ) { - GenomeLoc loc = context.getLocation(); - - VariantContext vc_postbgl = tracker.getVariantContext(ref,POSTBEAGLE_EVAL_ROD_NAME, null, loc, false); - VariantContext vc_prebgl = tracker.getVariantContext(ref,PREBEAGLE_EVAL_ROD_NAME, null, loc, false); - VariantContext vc_hapmap = tracker.getVariantContext(ref,INPUT_HAPMAP_ROD_NAME, null, loc, false); - 
VariantContext vc_comp = tracker.getVariantContext(ref,INPUT_COMP_ROD_NAME, null, loc, false); - if ( vc_postbgl == null || vc_prebgl == null || vc_comp == null) - return 0; - - - if (!vc_prebgl.hasGenotypes() || !vc_postbgl.hasGenotypes() ) - return 0; - - if (vc_postbgl.isFiltered()) - return 0; - - Map compGenotypes = vc_comp.getGenotypes(); - Integer alleleCountH = 0, chrCountH = 0, alleleCountEmp=0, chrCountEmp; - - // Get Hapmap AC and AF - if (vc_hapmap != null) { - Map hapmapGenotypes = vc_hapmap.getGenotypes(); - for ( String sample : vc_postbgl.getSampleNames() ) { - // use sample as key into genotypes structure - if (vc_postbgl.getGenotypes().containsKey(sample) && hapmapGenotypes.containsKey(sample)) { - - Genotype hapmapGenotype = hapmapGenotypes.get(sample); - if (hapmapGenotype.isCalled()){ - chrCountH += 2; - if (hapmapGenotype.isHet()) { - alleleCountH += 1; - } else if (hapmapGenotype.isHomVar()) { - alleleCountH += 2; - } - } - } - } - } - else { - alleleCountH = -1; - chrCountH = -1; - } - - chrCountEmp = vc_postbgl.getChromosomeCount(); -//System.out.println(chrCountH); - - if ( vc_postbgl.getAlternateAlleles().size() > 0 ) { - for ( Allele allele : vc_postbgl.getAlternateAlleles() ) { - alleleCountEmp = alleleCountEmp+vc_postbgl.getChromosomeCount(allele); - } -//System.out.println(alleleCountH); - } - - - - - for ( String sample : vc_postbgl.getSampleNames() ) { - if (sample.compareToIgnoreCase("NA12878")!=0) - continue; - - // use sample as key into genotypes structure - - Genotype postbglGenotype = vc_postbgl.getGenotype(sample); - Genotype prebglGenotype = vc_prebgl.getGenotype(sample); - Genotype compGenotype = compGenotypes.get(sample); - - - outputWriter.format("%d %d %d %d %d ", vc_postbgl.getStart(), alleleCountH, chrCountH, - alleleCountEmp, chrCountEmp); - - - String dps = postbglGenotype.getAttributeAsString(VCFConstants.DEPTH_KEY); - - int dp; - if (dps.compareTo(".")==0) - dp = -1; - else - dp = Integer.valueOf(dps); - - int hg, 
bg, pg; - if (compGenotype.isNoCall()) - hg = -1; - else if (compGenotype.isHomRef()) - hg = 0; - else if (compGenotype.isHet()) - hg = 1; - else if (compGenotype.isHomVar()) - hg = 2; - else - throw new ReviewedStingException("Bug! invalid genotype!"); - - if (postbglGenotype.isNoCall()) - bg = -1; - else if (postbglGenotype.isHomRef()) - bg = 0; - else if (postbglGenotype.isHet()) - bg = 1; - else if (postbglGenotype.isHomVar()) - bg = 2; - else - throw new ReviewedStingException("Bug! invalid genotype!"); - - - if (prebglGenotype.isNoCall()) - pg = -1; - else if (prebglGenotype.isHomRef()) - pg = 0; - else if (prebglGenotype.isHet()) - pg = 1; - else if (prebglGenotype.isHomVar()) - pg = 2; - else - throw new ReviewedStingException("Bug! invalid genotype!"); - - outputWriter.format("%d %d %d %d\n",dp, hg, pg, bg); - - - } - return 1; - } - return 0; - } - - public Integer reduceInit() { - return 0; // Nothing to do here - } - - /** - * Increment the number of loci processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of loci processed. - */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - /** - * Tell the user the number of loci processed and close out the new variants file. - * - * @param result the number of loci seen. 
- */ - public void onTraversalDone(Integer result) { - System.out.printf("Processed %d loci.\n", result); - - } - - - -} - diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/CNVstatsWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/CNVstatsWalker.java deleted file mode 100644 index b29917f48..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/CNVstatsWalker.java +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers.CNV; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; - -import java.io.PrintStream; -import java.util.*; - - -/** - * Walks along all variant ROD loci, and tabulates the statistics of the CNVs detected. - */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) -@By(DataSource.REFERENCE_ORDERED_DATA) - -public class CNVstatsWalker extends RodWalker { - - @Output(doc = "File to which copy number counts should be written", required = true) - protected PrintStream out; - - @Argument(fullName = "alleleCountsCopyNumberFreqs", shortName = "AC_CNF", doc = "File to which discovered allele copy and copy number frequencies should be written", required = false) - private PrintStream alleleCountsCopyNumberFreqs = null; - - @Argument(fullName = "minFracPassGt", shortName = "minFracPassGt", doc = "Minimum fraction of callable genotypes required to report any genotypes at all", required = false) - private double minFracPassGt = 0.0; - - private LinkedList rodNames = null; - - public static String CNV_TAG = ""; - public static String CN_FIELD = "CN"; - - public static String SVLEN_FIELD = "SVLEN"; - public static String AC_FIELD = "AC"; - - public 
static int DIPLOID = 2; - - public void initialize() { - rodNames = new LinkedList(); - rodNames.add("variant"); - } - - public boolean generateExtendedEvents() { - return false; - } - - public CNVstatistics reduceInit() { - return new CNVstatistics(); - } - - /** - * For each site, calculate the CNV stats. - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return dummy Integer - */ - public CNVstatistics map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - logger.debug("REF:" + ref.getLocus()); - CNVstatistics stats = new CNVstatistics(); - - boolean requireStartHere = true; // only see each VariantContext once - boolean takeFirstOnly = false; // take as many entries as the VCF file has - for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { - if (vc.isSymbolic() && vc.isBiallelic()) { - Allele altAll = vc.getAlternateAllele(0); - if (altAll.isSymbolic() && altAll.getDisplayString().equals(CNV_TAG)) { - logger.debug("Found CNV at locus..."); - stats.cnvDeclaredLoci++; - - CopyNumberCounts cnc = new CopyNumberCounts(); - - boolean hasDiploidGt = false; - boolean hasNonDiploidGt = false; - for (Map.Entry gtEntry : vc.getGenotypes().entrySet()) { - Genotype gt = gtEntry.getValue(); - Integer copyNum = gt.getAttributeAsIntegerNoException(CN_FIELD); - if (copyNum != null && gt.isNotFiltered()) { - cnc.incrementCopyNumber(copyNum); - - if (copyNum == DIPLOID) - hasDiploidGt = true; - else - hasNonDiploidGt = true; - } - } - - double calledFreq = ((double) cnc.calledCount()) / vc.getNSamples(); - if (calledFreq < minFracPassGt) { // reset data as if it did not appear - cnc.resetCounts(); - } - else { - if (hasDiploidGt && hasNonDiploidGt) { - stats.diploidAndNonDiploidLoci++; - } - else { - if (hasDiploidGt) - stats.diploidOnlyLoci++; - if 
(hasNonDiploidGt) - stats.nonDiploidOnlyLoci++; - } - } - - int cnvEnd = vc.getEnd(); - Integer cnvLength = vc.getAttributeAsIntegerNoException(SVLEN_FIELD); - if (cnvLength != null) - cnvEnd = vc.getStart() + cnvLength - 1; - GenomeLoc vcLoc = getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), vc.getStart(), cnvEnd, true); - out.print(vcLoc); - - for (Map.Entry copyNumEntry : cnc.entrySet()) { - out.print("\t" + copyNumEntry.getKey() + ":" + copyNumEntry.getValue()); - } - out.println(); - - if (alleleCountsCopyNumberFreqs != null) { - Integer ac = vc.getAttributeAsIntegerNoException(AC_FIELD); - CopyNumberCounts.DeletionDuplicationFreqs freqs = cnc.deletionDuplicationFreqs(); - double cnvCount = freqs.deletionFreq + freqs.duplicationFreq; - - alleleCountsCopyNumberFreqs.println(vcLoc + "\t" + ac + "\t" + freqs.deletionFreq + "\t" + freqs.duplicationFreq + "\t" + cnvCount); - } - } - } - } - - return stats; - } - - public CNVstatistics reduce(CNVstatistics result, CNVstatistics total) { - if (result == null) - return total; - - return total.addIn(result); - } - - /** - * @param result statistics of CNV sites - */ - public void onTraversalDone(CNVstatistics result) { - System.out.println(); - System.out.println("--------------------------------------"); - System.out.println("CNV summary:"); - System.out.println("--------------------------------------"); - - System.out.println("cnvDeclaredLoci: " + result.cnvDeclaredLoci); - - System.out.println(); - System.out.println("noGenotypesLoci: " + result.noGenotypesLoci()); - - System.out.println(); - System.out.println("nonDiploidOnlyLoci: " + result.nonDiploidOnlyLoci); - - System.out.println(); - System.out.println("lociWithDiploid: " + result.lociWithDiploid()); - System.out.println("diploidOnlyLoci: " + result.diploidOnlyLoci); - System.out.println("diploidAndNonDiploidLoci: " + result.diploidAndNonDiploidLoci); - String onlyDiploidRateStr = percentageString(result.diploidOnlyLoci, 
result.lociWithDiploid()); - System.out.println("onlyDiploidRate = " + onlyDiploidRateStr + "%"); - - System.out.println(); - int noDiploidGenotypes = result.noGenotypesLoci() + result.nonDiploidOnlyLoci; - System.out.println("loci with no diploid genotypes: " + noDiploidGenotypes); - String noDiploidGtRateStr = percentageString(noDiploidGenotypes, result.cnvDeclaredLoci); - System.out.println("noDiploidGtRate = " + noDiploidGtRateStr + "%"); - } - - private static String percentageString(int numerator, int denominator) { - int NUM_DECIMAL_PLACES = 2; - - return new Formatter().format("%." + NUM_DECIMAL_PLACES + "f", MathUtils.percentage(numerator, denominator)).toString(); - } -} - - -class CNVstatistics { - protected int cnvDeclaredLoci, diploidOnlyLoci, nonDiploidOnlyLoci, diploidAndNonDiploidLoci; - - public CNVstatistics() { - this.cnvDeclaredLoci = 0; - this.diploidOnlyLoci = 0; - this.nonDiploidOnlyLoci = 0; - this.diploidAndNonDiploidLoci = 0; - } - - public CNVstatistics addIn(CNVstatistics other) { - this.cnvDeclaredLoci += other.cnvDeclaredLoci; - this.diploidOnlyLoci += other.diploidOnlyLoci; - this.nonDiploidOnlyLoci += other.nonDiploidOnlyLoci; - this.diploidAndNonDiploidLoci += other.diploidAndNonDiploidLoci; - - return this; - } - - public int noGenotypesLoci() { - return cnvDeclaredLoci - (diploidOnlyLoci + nonDiploidOnlyLoci + diploidAndNonDiploidLoci); - } - - public int lociWithDiploid() { - return diploidOnlyLoci + diploidAndNonDiploidLoci; - } -} - -class CopyNumberCounts { - private Map copyNumToCountsMap; - private int calledCount; - - public CopyNumberCounts() { - this.copyNumToCountsMap = new TreeMap(); - this.resetCounts(); - } - - public void incrementCopyNumber(int copyNum) { - Integer count = copyNumToCountsMap.get(copyNum); - if (count == null) - count = 0; - - copyNumToCountsMap.put(copyNum, count + 1); - calledCount++; - } - - public Set> entrySet() { - return copyNumToCountsMap.entrySet(); - } - - public int calledCount() { - 
return calledCount; - } - - public void resetCounts() { - copyNumToCountsMap.clear(); - calledCount = 0; - } - - class DeletionDuplicationFreqs { - public double deletionFreq; - public double duplicationFreq; - - public DeletionDuplicationFreqs() { - this.deletionFreq = 0; - this.duplicationFreq = 0; - } - } - - public DeletionDuplicationFreqs deletionDuplicationFreqs() { - int total = 0; - DeletionDuplicationFreqs freqs = new DeletionDuplicationFreqs(); - - for (Map.Entry copyNumEntry : this.entrySet()) { - int copyNum = copyNumEntry.getKey(); - int count = copyNumEntry.getValue(); - - if (copyNum < CNVstatsWalker.DIPLOID) { - freqs.deletionFreq += count; - } - else if (copyNum > CNVstatsWalker.DIPLOID) { - freqs.duplicationFreq += count; - } - - total += count; - } - - freqs.deletionFreq /= total; - freqs.duplicationFreq /= total; - - return freqs; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/GeneNamesIntervalWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/GeneNamesIntervalWalker.java deleted file mode 100755 index b9a600239..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/GeneNamesIntervalWalker.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers.CNV; - -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.PrintStream; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -/** - * Walks along reference and calculates the genes (from "refseq" ROD) for each interval. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = GeneNamesIntervalWalker.REFSEQ_ROD_NAME, type = AnnotatorInputTableFeature.class)}) - -public class GeneNamesIntervalWalker extends RodWalker { - @Output - protected PrintStream out; - - public final static String REFSEQ_ROD_NAME = "refseq"; - - public final static String REFSEQ_NAME2 = "name2"; - - - public boolean isReduceByInterval() { - return true; - } - - public void initialize() { - } - - public boolean generateExtendedEvents() { - return false; - } - - public GeneNames reduceInit() { - return new GeneNames(); - } - - /** - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. - */ - public GeneNames map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - return new GeneNames().addGenes(tracker.getReferenceMetaData(REFSEQ_ROD_NAME)); - } - - public GeneNames reduce(GeneNames add, GeneNames runningCount) { - if (add == null) - add = new GeneNames(); - - return runningCount.addIn(add); - } - - /** - * @param results the genes found in each interval. 
- */ - public void onTraversalDone(List> results) { - for (Pair result : results ) { - GenomeLoc loc = result.getFirst(); - GeneNames names = result.getSecond(); - - out.println(loc + "\t" + names); - } - } -} - -class GeneNames { - public Set geneNames; - - public GeneNames() { - this.geneNames = new HashSet(); - } - - public GeneNames addIn(GeneNames other) { - this.geneNames.addAll(other.geneNames); - - return this; - } - - public GeneNames addGenes(List refSeqRODs) { - for (Object refSeqObject : refSeqRODs) { - AnnotatorInputTableFeature refSeqAnnotation = (AnnotatorInputTableFeature) refSeqObject; - if (refSeqAnnotation.containsColumnName(GeneNamesIntervalWalker.REFSEQ_NAME2)) - geneNames.add(refSeqAnnotation.getColumnValue(GeneNamesIntervalWalker.REFSEQ_NAME2)); - } - - return this; - } - - public String toString() { - if (geneNames.isEmpty()) - return "."; - - StringBuilder sb = new StringBuilder(); - - for (String gene : geneNames) - sb.append(gene).append(";"); - - return sb.toString(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/OverlapWithBedInIntervalWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/OverlapWithBedInIntervalWalker.java deleted file mode 100755 index b919b3422..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/OverlapWithBedInIntervalWalker.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and 
this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers.CNV; - -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; - -import java.io.PrintStream; -import java.util.List; - -/** - * Walks along reference and calculates the percent overlap with the BED file intervals for each -L interval. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = OverlapWithBedInIntervalWalker.INTERVALS_ROD_NAME, type = ReferenceOrderedDatum.class)}) - -public class OverlapWithBedInIntervalWalker extends RodWalker { - @Output - protected PrintStream out; - - public final static String INTERVALS_ROD_NAME = "intervals"; - - - public boolean isReduceByInterval() { - return true; - } - - public void initialize() { - } - - public boolean generateExtendedEvents() { - return false; - } - - public CumulativeBaseOverlapCount reduceInit() { - return new CumulativeBaseOverlapCount(); - } - - /** - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. - */ - public CumulativeBaseOverlapCount map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - return new CumulativeBaseOverlapCount().addIntervals(tracker.getGATKFeatureMetaData(INTERVALS_ROD_NAME, true)); - } - - public CumulativeBaseOverlapCount reduce(CumulativeBaseOverlapCount add, CumulativeBaseOverlapCount runningCount) { - if (add == null) - add = new CumulativeBaseOverlapCount(); - - return runningCount.addIn(add); - } - - /** - * @param results the genes found in each interval. 
- */ - public void onTraversalDone(List> results) { - for (Pair result : results ) { - GenomeLoc loc = result.getFirst(); - - CumulativeBaseOverlapCount overlapCount = result.getSecond(); - double meanOverlap = ((double) overlapCount.totalOverlapCount) / loc.size(); - - out.println(loc + "\t" + meanOverlap); - } - } -} - -class CumulativeBaseOverlapCount { - public int totalOverlapCount; - - public CumulativeBaseOverlapCount() { - this.totalOverlapCount = 0; - } - - public CumulativeBaseOverlapCount addIn(CumulativeBaseOverlapCount other) { - this.totalOverlapCount += other.totalOverlapCount; - - return this; - } - - public CumulativeBaseOverlapCount addIntervals(List interval) { - totalOverlapCount += interval.size(); - - return this; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append(totalOverlapCount); - - return sb.toString(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/PrintIntervalsNotInBedWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/PrintIntervalsNotInBedWalker.java deleted file mode 100644 index 3cc63ca5c..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/PrintIntervalsNotInBedWalker.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers.CNV; - -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.io.PrintStream; -import java.util.List; - -/** - * Walks along reference and prints intervals of sequence not covered in ANY interval in "intervals" ROD. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = PrintIntervalsNotInBedWalker.INTERVALS_ROD_NAME, type = ReferenceOrderedDatum.class)}) -@By(DataSource.REFERENCE) // So that we will actually enter loci with no ROD on them - -public class PrintIntervalsNotInBedWalker extends RodWalker { - @Output - protected PrintStream out; - - public final static String INTERVALS_ROD_NAME = "intervals"; - - private GenomeLoc waitingInterval = null; - - public void initialize() { - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return number of intervals printed. - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - GenomeLoc curLoc = ref.getLocus(); - int curPos = curLoc.getStart(); - int printed = 0; - - List intervals = tracker.getGATKFeatureMetaData(INTERVALS_ROD_NAME, true); - if (intervals.isEmpty()) { - if (waitingInterval != null && curLoc.compareContigs(waitingInterval) == 0 && curPos == waitingInterval.getStop() + 1) { - waitingInterval = getToolkit().getGenomeLocParser().setStop(waitingInterval, curPos); - } - else { - printed += printWaitingIntervalAsBed(); - waitingInterval = ref.getLocus(); - } - } - else { - printed += printWaitingIntervalAsBed(); - } - - return printed; - } - - public Integer reduce(Integer add, Integer runningCount) { - if (add == null) - add = 0; - - return runningCount + add; - } - - /** - * @param result the genes found in each interval. 
- */ - public void onTraversalDone(Integer result) { - result += printWaitingIntervalAsBed(); - - System.out.println("Printed out " + result + " intervals."); - } - - private int printWaitingIntervalAsBed() { - if (waitingInterval == null) - return 0; - - out.println(waitingInterval.getContig() + "\t" + (waitingInterval.getStart() - 1) + "\t" + waitingInterval.getStop()); - waitingInterval = null; - - return 1; - } -} - diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/ReferenceFASTAforBedIntervalsWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/ReferenceFASTAforBedIntervalsWalker.java deleted file mode 100755 index ea67900c1..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/ReferenceFASTAforBedIntervalsWalker.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers.CNV; - -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.fasta.FastaSequence; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; - -import java.io.PrintStream; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Walks along reference and prints the reference sequence (as FASTA) for the BED file intervals ("intervals" ROD). - */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = ReferenceFASTAforBedIntervalsWalker.INTERVALS_ROD_NAME, type = ReferenceOrderedDatum.class)}) - -public class ReferenceFASTAforBedIntervalsWalker extends RodWalker { - @Output - protected PrintStream out; - - public final static String INTERVALS_ROD_NAME = "intervals"; - - private Map intervalSequences; - - private final static int LINE_WIDTH = 60; - - public void initialize() { - this.intervalSequences = new HashMap(); - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return number of interval sequences printed - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - GenomeLoc curLoc = ref.getLocus(); - int curPos 
= curLoc.getStart(); - int entries = 0; - - List intervals = tracker.getGATKFeatureMetaData(INTERVALS_ROD_NAME, true); - for (GATKFeature interval : intervals) { - GenomeLoc loc = interval.getLocation(); - /* TODO: note that an interval may actually start BEFORE here, but not be covered, but would need to cache the remappings - of origLoc -> newLoc, and then setName(newLoc.toString()) */ - - FastaSequence seq = null; - if (loc.getStart() == curPos) { // at the start of this interval: - seq = new FastaSequence(out, LINE_WIDTH, false); - seq.setName(loc.toString()); - intervalSequences.put(loc, seq); - } - else { - seq = intervalSequences.get(loc); - } - - seq.append(String.valueOf((char) ref.getBase())); - - if (loc.getStop() == curPos) { // at the end of this interval: - intervalSequences.remove(loc); - seq.flush(); - entries++; - } - } - - return entries; - } - - public Integer reduce(Integer add, Integer runningCount) { - if (add == null) - add = 0; - - return runningCount + add; - } - - /** - * @param result the genes found in each interval. 
- */ - public void onTraversalDone(Integer result) { - result += intervalSequences.size(); - - for (Map.Entry locSeqEntry : intervalSequences.entrySet()) { - GenomeLoc interval = locSeqEntry.getKey(); - FastaSequence seq = locSeqEntry.getValue(); - - int actualStop = interval.getStart() + (int)seq.getCurrentCount() - 1; - GenomeLoc actualInterval = getToolkit().getGenomeLocParser().setStop(interval, actualStop); - seq.setName(actualInterval.toString()); - - seq.flush(); - } - - System.out.println("Printed out " + result + " sequence entries."); - } -} - diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java deleted file mode 100644 index d454dec7e..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.PrintStream; -import java.util.*; - -import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; - -/** - * Computes raw GL calibration data for read groups in BAMs against a comp VCF track of genotypes - * - * @author depristo - * @since May, 2011 - * @help.summary Computes raw GL calibration data for read groups in BAMs against a comp VCF track of genotypes - */ - -@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="alleles",type=VariantContext.class)) -@Allows(value={DataSource.READS, DataSource.REFERENCE}) - -// Ugly fix because RodWalkers don't have access to reads 
-@By(DataSource.REFERENCE) -@Reference(window=@Window(start=-200,stop=200)) -public class CalibrateGenotypeLikelihoods extends RodWalker implements TreeReducible { - public static final String COMP_NAME = "alleles"; - - @Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false) - private int mbq = -1; - - @Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false) - private double deletions = -1; - - //@Argument(fullName="standard_min_confidence_threshold_for_calling", shortName="stand_call_conf", doc="the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls", required=false) - private double callConf = 0; - - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - Set samples; - - /** - * Trivial wrapper class. Data is a collection of Datum. - */ - public static class Data { - Collection values; - - public Data() { this(new LinkedList()); } - public Data(Collection data) { this.values = data; } - - final public static Data EMPTY_DATA = new Data(Collections.emptyList()); - } - - /** - * The raw datapoints we are tracking for a specific site for a specific sample. - * read group id and sample name. The PL object. - * the ref and alt alleles. The type of the variant context. And the genotype of the - * comp. track at this site. - */ - public static class Datum implements Comparable { - final String rgID, sample; - final GenotypeLikelihoods pl; - final String ref, alt; - final VariantContext.Type siteType; - final Genotype.Type genotypeType; - - @Override - public int compareTo(Datum o) { - int bySample = sample.compareTo(o.sample); - int byRG = rgID.compareTo(o.rgID); - return bySample != 0 ? 
bySample : byRG; - } - - public Datum(String ref, String alt, String sample, String rgID, GenotypeLikelihoods pl, VariantContext.Type siteType, Genotype.Type genotypeType) { - this.ref = ref; - this.alt = alt; - this.sample = sample; - this.rgID = rgID; - this.pl = pl; - this.siteType = siteType; - this.genotypeType = genotypeType; - } - } - - private UnifiedGenotyperEngine snpEngine; - private UnifiedGenotyperEngine indelEngine; - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - public void initialize() { - // We only operate over the samples in the BAM file - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - logger.info("Samples: " + samples); - if ( samples.size() > 1 ) // todo -- remove me when we support multiple samples - throw new UserException.BadInput("CalibrateGenotypeLikelihoods does not currently support comparison of multiple samples simulatenously. To enable, see TODO in code"); - - List rodList = this.getToolkit().getRodDataSources(); - if ( rodList.size() != 1 ) - throw new UserException.BadInput("You should provide exactly one genotype VCF"); - if ( !rodList.get(0).getName().equals(COMP_NAME)) - throw new UserException.BadInput("The ROD track has to be named \""+ COMP_NAME +"\". 
Not " + rodList.get(0).getName()); - - // Filling in SNP calling arguments for UG - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; - uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - uac.NO_SLOD = true; - if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq; - if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions; - uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; - uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac); - - // Adding the INDEL calling arguments for UG - uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL; - indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac); - } - - @Override - // todo -- remove me when the new indel genotyping is done - public boolean generateExtendedEvents() { return true; } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - public Data map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - if ( tracker == null || tracker.getNBoundRodTracks() == 0 ) - return Data.EMPTY_DATA; - - // Grabs a usable VariantContext from the Alleles ROD - VariantContext vcComp = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger); - if( vcComp == null ) - return Data.EMPTY_DATA; - - Data data = new Data(); - for ( String sample : samples ) { - // What's the genotype of our sample at this record? - Genotype compGT = getGenotype(tracker, ref, sample, COMP_NAME); - if ( compGT == null || compGT.isNoCall() ) - continue; - - // For each read group - // todo -- this only works with a single sample right now. 
For multi-sample BAMs - // todo -- this loop needs to be refactored so that the spliting by read group only happens once - // todo -- and the read groups appropriate to each sample is used. - Map byRG = AlignmentContextUtils.splitContextByReadGroup(context, getToolkit().getSAMFileHeader().getReadGroups()); - //byRG.put(new SAMReadGroupRecord("ALL"), context); // uncomment to include a synthetic RG for all RG for the sample - for ( Map.Entry rgAC : byRG.entrySet() ) { - VariantCallContext call; - if ( vcComp.isIndel() ) { - throw new UserException.BadInput("CalibrateGenotypeLikelihoods does not currently support indel GL calibration. This capability needs to be tested and verified to be working with the new genotyping code for indels in UG"); - //call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, rgAC.getValue()); - } else { - call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, rgAC.getValue()); - } - - if ( call == null ) - throw new ReviewedStingException("Unexpected genotyping failure " + sample + " at " + ref.getLocus() + " call " + call); - - Genotype rgGT = call.getGenotype(sample); - - if ( rgGT != null && ! 
rgGT.isNoCall() && rgGT.getLikelihoods().getAsVector() != null ) { - Datum d = new Datum(vcComp.getReference().getBaseString(), vcComp.getAlternateAllele(0).getBaseString(), - sample, rgAC.getKey().getReadGroupId(), rgGT.getLikelihoods(), vcComp.getType(), compGT.getType()); - data.values.add(d); - } - } - } - - return data; - } - - /** - * Convenience function that determines the genotype in the comp VC for sample - * - * @param tracker - * @param ref - * @param sample - * @param rod - * @return - */ - private Genotype getGenotype(RefMetaDataTracker tracker, ReferenceContext ref, String sample, String rod) { - for ( VariantContext vc : tracker.getVariantContexts(ref, rod, null, ref.getLocus(), true, false) ) { - if ( vc.isNotFiltered() && vc.hasGenotype(sample) ) - return vc.getGenotype(sample); - else - return null; - } - - return null; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public Data reduceInit() { - return new Data(); - } - - @Override - public Data treeReduce( final Data sum1, final Data sum2) { - sum2.values.addAll(sum1.values); - return sum2; - } - - @Override - public Data reduce( final Data mapValue, final Data reduceSum ) { - return treeReduce(mapValue, reduceSum); - } - - @Override - public void onTraversalDone(Data data) { - // print the header - List pGNames = Arrays.asList("QofAAGivenD", "QofABGivenD", "QofBBGivenD"); - List fields = Arrays.asList("sample", "rg", "ref", "alt", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD"); - out.println(Utils.join("\t", fields)); - - // determine the priors by counting all of the events we've seen in comp - double[] counts = new double[]{1, 1, 1}; - for ( Datum d : data.values ) { counts[d.genotypeType.ordinal()-1]++; } - double sum = MathUtils.sum(counts); - 
logger.info(String.format("Types %s %s %s", Genotype.Type.values()[1], Genotype.Type.values()[2], Genotype.Type.values()[3])); - logger.info(String.format("Counts %.0f %.0f %.0f %.0f", counts[0], counts[1], counts[2], sum)); - double[] log10priors = new double[]{Math.log10(counts[0] / sum), Math.log10(counts[1] / sum), Math.log10(counts[2] / sum)}; - logger.info(String.format("Priors %.2f %.2f %.2f", log10priors[0], log10priors[1], log10priors[2])); - - // emit the molten data set - for ( Datum d : data.values ) { - double[] log10pGGivenD = d.pl.getAsVector().clone(); - for ( int i = 0; i < log10priors.length; i++ ) log10pGGivenD[i] += log10priors[i]; - double[] pOfGGivenD = MathUtils.normalizeFromLog10(log10pGGivenD, false); - for ( int i = 0; i < pGNames.size(); i++ ) { - int q = QualityUtils.probToQual(pOfGGivenD[i], Math.pow(10.0, -9.9)); - if ( q > 1 ) { // tons of 1s, and not interesting - out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d%n", - d.sample, d.rgID, d.ref, d.alt, d.siteType, d.pl.getAsString(), d.genotypeType.toString(), - pGNames.get(i), q); - } - } - } - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CountIntervals.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CountIntervals.java deleted file mode 100755 index 06db31622..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CountIntervals.java +++ /dev/null @@ -1,62 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import 
org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; - -import java.util.List; -import java.io.PrintStream; - -/** - * Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but - * very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to. - * This was its very first use. - */ -public class CountIntervals extends RefWalker { - @Output - PrintStream out; - - @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false) - int numOverlaps = 2; - - public Long reduceInit() { - return 0l; - } - - public boolean isReduceByInterval() { return true; } - - public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) { - return null; - } - - List checkIntervals = tracker.getGATKFeatureMetaData("check",false); - return (long) checkIntervals.size(); - } - - public Long reduce(Long loc, Long prev) { - if ( loc == null ) { - return 0l; - } else { - return Math.max(prev,loc); - } - } - - public void onTraversalDone(List> finalReduce) { - long count = 0; - for ( Pair g : finalReduce ) { - if ( g.second >= numOverlaps) { - count ++; - } - } - out.printf("Number of contiguous intervals: %d",count); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CreateTiTvTrack.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CreateTiTvTrack.java deleted file mode 100755 index 63b17bda3..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CreateTiTvTrack.java +++ /dev/null @@ -1,109 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import 
org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.wiggle.WiggleHeader; -import org.broadinstitute.sting.utils.wiggle.WiggleWriter; - -import java.util.ArrayList; -import java.io.PrintStream; - -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl - * - * @Author chartl - * @Date Jul 21, 2010 - */ -public class CreateTiTvTrack extends RodWalker { - @Output - PrintStream out; - - @Argument(shortName="size",doc="Size of the window",required = true) - int size = -1; - - private WiggleWriter writer; - - public TiTvWindow reduceInit() { - writer = new WiggleWriter(out); - writer.writeHeader(new WiggleHeader("TiTv",String.format("The Transition Transversion rate across the genome using variants windows of size %d",size))); - return new TiTvWindow(size); - } - - // public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - // return tracker == null || tracker.getVariantContext(ref, "variant", null, context.getLocation(), true).isFiltered(); - //} - - public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - VariantContext vc; - if ( tracker != null ) { - vc = tracker.getVariantContext(ref, "variant", null, context.getLocation(), true); - return vc; - } else { - return null; - } - } - - public TiTvWindow reduce(VariantContext vc, TiTvWindow window) { - if ( vc == null || ! 
vc.isSNP() || vc.getAlternateAlleles().size() > 1) { - return window; - } - - window.update(VariantContextUtils.isTransition(vc)); - if ( window.getTiTv() != null ) { - writer.writeData(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc),window.getTiTv()); - } - - return window; - } - - public void onTraversalDone(TiTvWindow window) { - - } - -} - -class TiTvWindow { - long nTi; - long nTv; - ArrayList variants; - int maxSize; - - public TiTvWindow(int size) { - maxSize = size; - variants = new ArrayList(size); - } - - public void update(Boolean isTi) { - if ( variants.size() == maxSize ) { - Boolean first = variants.remove(0); - if ( first ) { - nTi--; - } else { - nTv--; - } - } - - variants.add(isTi); - if ( isTi ) { - nTi++; - } else { - nTv++; - } - - //System.out.println(variants.size()); - } - - public Double getTiTv() { - if ( variants.size() == maxSize ) { - return ( nTi + 1.0 )/(nTv + 1.0); - } else { - return null; // window not full - } - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalkerV3.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalkerV3.java deleted file mode 100644 index 4dfc02687..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalkerV3.java +++ /dev/null @@ -1,475 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import net.sf.samtools.SAMRecord; - -import java.util.*; -import java.io.PrintStream; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Jan 3, 2010 - * Time: 1:58:38 PM - * To change this template use File | Settings | File Templates. 
- */ -public class DSBWalkerV3 extends ReadWalker { - @Output - PrintStream out; - - @Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true) - int WINDOW_SIZE = 100; - @Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true) - double ENRICHMENT_CUTOFF = 5.0; - @Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+ - "(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+ - "so that control*enrichmentCutoff is too low to be convincing)",required=true) - int MIN_SIGNAL = 10; - @Argument(fullName="coverageFactor",shortName="cf",doc="Total number of uniquely mapped signal reads/total number of uniquely mapped control reads",required=false) - double COVERAGE_FACTOR=1.0; - @Argument(fullName="coverageFactorNU",shortName="cfnu",doc="Total number of non-uniquely mapped signal reads/total number of non-uniquely mapped control reads",required=false) - double COVERAGE_FACTOR_NU=1.0; - - private Set signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able - private Set controlReadGroups ; // to properly assign the reads coming from a merged stream - - private GenomeLoc currentWindow = null; - private String currentContig = "chrM"; - - - private LinkedList readsInSignalWindow = null; - private LinkedList readsInControlWindow = null; - - private WindowStats signalCountsInCurrWindow = new WindowStats(); - private WindowStats controlCountsInCurrWindow = new WindowStats(); - - - // following variables are used by emitWindow to buffer adjacent windows - private int MERGE_CUTOFF = -1; - - private long regionStart = -1; - private long lastWindowStart = -1; - private int addedSinceLastEmit = 0; // how many sliding window steps where buffered since the last emit (i.e. 
since the last window that really passed significance criteria) - // buffered read count stats for the windows inside the currently held merged print region: - private List signalReadCountsBuffer = new ArrayList(1000); - private List controlReadCountsBuffer = new ArrayList(1000); - - - /** Clears buffered reads and all counts. DOES NOT clear buffered print region */ - private void resetWindows() { - readsInSignalWindow.clear(); - readsInControlWindow.clear(); - signalCountsInCurrWindow.clear(); - controlCountsInCurrWindow.clear(); - } - - private void addSignal(SAMRecord read) { - readsInSignalWindow.add(read); - signalCountsInCurrWindow.addRead(read); - } - - private void addControl(SAMRecord read) { - readsInControlWindow.add(read); - controlCountsInCurrWindow.addRead(read); - } - - /** Discard signal reads that start strictly before the specified position and - * update associated counts - * @param pos - */ - private void purgeSignal(long pos) { - Iterator it = readsInSignalWindow.iterator(); - while ( it.hasNext() ) { - SAMRecord r = it.next(); - if ( r.getAlignmentStart() >= pos ) return; // we are done - - // read starts before pos: discard it and update the counts: - signalCountsInCurrWindow.removeRead(r); - it.remove(); - } - } - - /** Discard signal reads that start strictly before the specified position and - * update associated counts - * @param pos - */ - private void purgeControl(long pos) { - Iterator it = readsInControlWindow.iterator(); - while ( it.hasNext() ) { - SAMRecord r = it.next(); - if ( r.getAlignmentStart() >= pos ) return; // we are done - - // read starts before pos: discard it and update the counts: - controlCountsInCurrWindow.removeRead(r); - it.remove(); - } - } - - private void resetWindowMergingBuffer(long start) { - regionStart = start; - lastWindowStart = start; - signalReadCountsBuffer.clear(); - controlReadCountsBuffer.clear(); - signalReadCountsBuffer.add(signalCountsInCurrWindow.clone()); - 
controlReadCountsBuffer.add(controlCountsInCurrWindow.clone()); - } - - /** Delayed print: the window starting at 'start' will be added to the print buffer; if the window is close enough - * to the current contents if the buffer, the addition will result in merging the window with the buffer; - * otherwise, the old contents of the buffer will be printed and the buffer will be re-initialized with new window. - * It is assumed that counters are in synch with the start position passed to this method. - * @param start - */ - private void emitWindow(long start) { - // System.out.println("Emitting at "+start); - - if ( regionStart == -1 ) { // we did not keep any region so far; initialize the buffer and return, will print later - resetWindowMergingBuffer(start); - addedSinceLastEmit = 0; - return; - } - - if ( start > lastWindowStart + MERGE_CUTOFF ) { - // this loop is a dummy: we have already cleared those unneeded - // counts in shiftWindows(); stays here to avoid generating bugs later - // if we change something in shiftWindows() - for ( ; addedSinceLastEmit > 0 ; addedSinceLastEmit-- ) { - signalReadCountsBuffer.remove(signalReadCountsBuffer.size()-1); - controlReadCountsBuffer.remove(controlReadCountsBuffer.size()-1); - } - printRegion(); - resetWindowMergingBuffer(start); - return; - } - - // the current window is too close to the previous one: we have to merge; - // NOTE: if window is too close, bufferAccepts() returned true, so the counts are already - // added. 
- lastWindowStart = start; - addedSinceLastEmit = 0; -// signalReadCountsBuffer.add(uniqueSignalReads); -// controlReadCountsBuffer.add(uniqueControlReads); - - } - - private boolean bufferAccepts(long pos) { - return ( regionStart != -1 && pos <= lastWindowStart+MERGE_CUTOFF); - } - - - - private void printRegion() { - if ( regionStart == -1 ) return; - - long regionStop = lastWindowStart+WINDOW_SIZE-1; - - double[] tmpEnrU = new double[signalReadCountsBuffer.size()]; - int[] tmpSignalU = new int[signalReadCountsBuffer.size()]; - int[] tmpControlU = new int[signalReadCountsBuffer.size()]; - double[] tmpEnrNU = new double[signalReadCountsBuffer.size()]; - int[] tmpSignalNU = new int[signalReadCountsBuffer.size()]; - int[] tmpControlNU = new int[signalReadCountsBuffer.size()]; - - double[] tmpFWDSignalFracU = new double[signalReadCountsBuffer.size()]; - double[] tmpFWDControlFracU = new double[signalReadCountsBuffer.size()]; - double[] tmpFWDSignalFracNU = new double[signalReadCountsBuffer.size()]; - double[] tmpFWDControlFracNU = new double[signalReadCountsBuffer.size()]; - - int lastInd = signalReadCountsBuffer.size() - 1; - - // out.println("Size="+signalReadCountsBuffer.size()+":"); - - for ( int i = 0 ; i <= lastInd ; i++ ) { - tmpEnrU[i]= ( ((double) signalReadCountsBuffer.get(i).uniqueReads) / (controlReadCountsBuffer.get(i).uniqueReads+1.0 ) ) / COVERAGE_FACTOR ; - - tmpSignalU[i] = signalReadCountsBuffer.get(i).uniqueReads; - tmpControlU[i] = controlReadCountsBuffer.get(i).uniqueReads; - - tmpEnrNU[i]= ( ((double) signalReadCountsBuffer.get(i).nonUniqueReads) / (controlReadCountsBuffer.get(i).nonUniqueReads+1.0 ) ) / COVERAGE_FACTOR_NU ; - - tmpSignalNU[i] = signalReadCountsBuffer.get(i).nonUniqueReads; - tmpControlNU[i] = controlReadCountsBuffer.get(i).nonUniqueReads; - - tmpFWDSignalFracU[i] = signalReadCountsBuffer.get(i).uniqueReads > 0 ? 
( ((double)signalReadCountsBuffer.get(i).uniqueFWDReads) / signalReadCountsBuffer.get(i).uniqueReads ) : 0.5; - tmpFWDControlFracU[i] = controlReadCountsBuffer.get(i).uniqueReads > 0 ? ( ((double)controlReadCountsBuffer.get(i).uniqueFWDReads) / controlReadCountsBuffer.get(i).uniqueReads ) : 0.5; - tmpFWDSignalFracNU[i] = signalReadCountsBuffer.get(i).nonUniqueReads > 0 ? ( ((double)signalReadCountsBuffer.get(i).nonUniqueFWDReads) / signalReadCountsBuffer.get(i).nonUniqueReads ) : 0.5; - tmpFWDControlFracNU[i] = controlReadCountsBuffer.get(i).nonUniqueReads > 0 ? ( ((double)controlReadCountsBuffer.get(i).nonUniqueFWDReads) / controlReadCountsBuffer.get(i).nonUniqueReads ) : 0.5; - } - - Arrays.sort(tmpEnrU); - Arrays.sort(tmpSignalU); - Arrays.sort(tmpControlU); - - Arrays.sort(tmpEnrNU); - Arrays.sort(tmpSignalNU); - Arrays.sort(tmpControlNU); - - Arrays.sort(tmpFWDSignalFracU); - Arrays.sort(tmpFWDControlFracU); - Arrays.sort(tmpFWDSignalFracNU); - Arrays.sort(tmpFWDControlFracNU); - - - out.print(currentContig+":"+regionStart+"-"+regionStop+"\t"+ - (regionStop-regionStart+1) +"\t"+ - "signal_unique:"+ tmpSignalU[0]+"-"+ tmpSignalU[lastInd/2]+"-"+ tmpSignalU[lastInd]+"\t"+ - "control_unique:"+ tmpControlU[0]+"-"+ tmpControlU[lastInd/2]+"-"+ tmpControlU[lastInd]); - - out.printf("\tsignal_fwd_frac_unique:%.1f-%.1f-%.1f",tmpFWDSignalFracU[0],tmpFWDSignalFracU[lastInd/2],tmpFWDSignalFracU[lastInd]); - out.printf("\tcontrol_fwd_frac_unique:%.1f-%.1f-%.1f",tmpFWDControlFracU[0],tmpFWDControlFracU[lastInd/2],tmpFWDControlFracU[lastInd]); - - out.print("\tsignal_nonnunique:"+ tmpSignalNU[0]+"-"+ tmpSignalNU[lastInd/2]+"-"+ tmpSignalNU[lastInd]+"\t"+ - "control_nonunique:"+ tmpControlNU[0]+"-"+ tmpControlNU[lastInd/2]+"-"+ tmpControlNU[lastInd]); - - out.printf("\tsignal_fwd_frac_nonunique:%.1f-%.1f-%.1f",tmpFWDSignalFracNU[0],tmpFWDSignalFracNU[lastInd/2],tmpFWDSignalFracNU[lastInd]); - 
out.printf("\tcontrol_fwd_frac_nonunique:%.1f-%.1f-%.1f",tmpFWDControlFracNU[0],tmpFWDControlFracNU[lastInd/2],tmpFWDControlFracNU[lastInd]); - - out.printf("\tnorm_enrichment_unique:%.2f-%.2f-%.2f",tmpEnrU[0],tmpEnrU[lastInd/2],tmpEnrU[lastInd]); - out.printf("\tnorm_enrichment_nonunique:%.2f-%.2f-%.2f",tmpEnrNU[0],tmpEnrNU[lastInd/2],tmpEnrNU[lastInd]); - - // if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER"); - out.println(); - - regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty - // System.exit(1); - } - - private void updateWindowMergingBuffer(long i) { - if ( bufferAccepts(i) ) { - // we are not too far away from last window added to the buffer that actually passed significance criteria; - // in this case we have to keep buffering since another significant window may be encountered soon - // System.out.println("Updating buffer at "+i+" with "+ uniqueSignalReads); - signalReadCountsBuffer.add(signalCountsInCurrWindow.clone()); - controlReadCountsBuffer.add(controlCountsInCurrWindow.clone()); - addedSinceLastEmit++; - } else { - // we are too far from the last significant window; if another significant window comes later, it will not - // be merged into this region but will start a new one. In this case we have to erase all the counts we have been - // saving since the last significant window (the latter is where the current region is going to end!) - for ( ; addedSinceLastEmit > 0 ; addedSinceLastEmit-- ) { - signalReadCountsBuffer.remove(signalReadCountsBuffer.size()-1); - controlReadCountsBuffer.remove(controlReadCountsBuffer.size()-1); - } - printRegion(); // print current region right away, why not? next significant window will start new region for sure. - } - - } - - private void shiftWindows(int pos) { - // we shift windows when there is a read that does not fit into the current window. 
- // the position, to which the shift is performed, is the first position such that the new read - // can be accomodated. Hence we can safely slide up to pos, only discarding reads that go out of scope - - // we are guaranteed that there will be no new reads to add until we reach pos. - - - for ( long i = currentWindow.getStart() ; i < pos ; i++ ) { -// if ( readsInSignalWindow.size() == 0 ) { -// i = pos-1; -// continue; -// }; - -// if ( readsInSignalWindow.getFirst().getAlignmentStart() > i ) { -// i = readsInSignalWindow.getFirst().getAlignmentStart() - 1; // jump directly to next read position -// continue; -// } - - purgeSignal(i); // remove all the reads that start before current position i (and update all the counters) - purgeControl(i); - - updateWindowMergingBuffer(i); - - if ( ( controlCountsInCurrWindow.uniqueReads + 1 ) * ENRICHMENT_CUTOFF < MIN_SIGNAL ) { - // too few control reads - if ( signalCountsInCurrWindow.uniqueReads >= MIN_SIGNAL ) { - // emit signal only if it is higher that hard cut-off: - emitWindow(i); // print current window (print can be buffered and delayed!) - } - } else { - // enough control reads; - // check for actual enrichment: - if ( ((double) signalCountsInCurrWindow.uniqueReads) / (controlCountsInCurrWindow.uniqueReads+1.0) > ENRICHMENT_CUTOFF ) { - emitWindow(i); // print current window (print can be buffered and delayed!) 
- } - } - - } - - // we emitted intermediate windows up to pos-1 as/if needed and purged everything that starts before pos-1 - // now we have to purge everything that starts before pos and return (no emitting yet, as we are about to add a read upon return): - - purgeSignal(pos); - purgeControl(pos); - - currentWindow = getToolkit().getGenomeLocParser().createGenomeLoc(currentWindow.getContig(),pos,pos+WINDOW_SIZE-1); - } - - @Override - public void initialize() { - int nSams = getToolkit().getArguments().samFiles.size(); - - if ( nSams != 2 ) { - out.println("ERROR: two input bam files (signal and backround control) must be specified"); - System.exit(1); - } - List> readGroupSets = getToolkit().getMergedReadGroupsByReaders(); - signalReadGroups = readGroupSets.get(0); -// System.out.println(signalReadGroups.size()+" read groups in signal"); - controlReadGroups = readGroupSets.get(1); -// System.out.println(controlReadGroups.size()+" read groups in control"); - - String sequenceName = getToolkit().getReferenceDataSource().getReference().getSequenceDictionary().getSequence(0).getSequenceName(); - currentWindow = getToolkit().getGenomeLocParser().createGenomeLoc(sequenceName,1,WINDOW_SIZE); - readsInSignalWindow = new LinkedList(); - readsInControlWindow = new LinkedList(); - - MERGE_CUTOFF = WINDOW_SIZE; - ENRICHMENT_CUTOFF *= COVERAGE_FACTOR; - currentContig = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(0).getSequenceName(); - } - - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - - if ( AlignmentUtils.isReadUnmapped(read) ) return 0; - - if ( read.getReferenceIndex() > currentWindow.getContigIndex() ) { - printRegion(); // print all we had on the previous contig - - currentWindow = ref.getGenomeLocParser().createGenomeLoc(read.getReferenceName(), - read.getAlignmentStart(), - read.getAlignmentStart()+WINDOW_SIZE-1); - currentContig = read.getReferenceName(); - resetWindows(); - } else { - // 
we are on the same contig - if ( read.getAlignmentEnd() > currentWindow.getStop() ) { - // can not accomodate the read inside the current window - shift! - // System.out.println("read ends at "+read.getAlignmentEnd()+" window ends at "+currentWindow.getStop()+ " shifting to "+ (currentWindow.getStart() + ( read.getAlignmentEnd() - currentWindow.getStop() )) +" ("+uniqueSignalReads+"/"+uniqueControlReads+")"); - - // while shifting the window, the following method will issue (delayed) print commands for - // all intermediate windows that pass significance criteria: - shiftWindows(currentWindow.getStart() + ( read.getAlignmentEnd() - currentWindow.getStop() )); - } - // now the read will fit into the window - } - - // at this point we are guaranteed that the read will fit into the window - - if ( signalReadGroups.contains( read.getReadGroup().getReadGroupId() ) ) { - addSignal(read); - } else if ( controlReadGroups.contains( read.getReadGroup().getReadGroupId() )) { - addControl(read); - } else { - throw new UserException.MalformedBAM(read, "Read "+read + " belongs to unrecognized read group"); - } - return 1; - } - - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - return 0; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(Integer value, Integer sum) { - return value+sum; //To change body of implemented methods use File | Settings | File Templates. - } - - - /** Auxiliary class that encapsulates the task of monitoring counts of various read traits in some set of reads - * (for instance, reads in the current window). 
Counted traits include uniquely/non-uniquely mapped reads, - * forward-strand aligned reads etc. - */ - class WindowStats implements Cloneable { - public int uniqueReads = 0; - public int nonUniqueReads = 0; - public int uniqueFWDReads = 0; - public int nonUniqueFWDReads = 0; - - /** Reset all counts to 0 */ - public void clear() { - uniqueReads = nonUniqueReads = uniqueFWDReads = nonUniqueFWDReads = 0; - } - - /** Examines the read and increments the counts for all the monitored traits observed in this read. */ - public void addRead(SAMRecord r) { - if ( r.getMappingQuality() == 0 ) { - // nonunique - nonUniqueReads++; - if ( ! r.getReadNegativeStrandFlag() ) nonUniqueFWDReads++; - } else { - // unique - uniqueReads++; - if ( ! r.getReadNegativeStrandFlag() ) uniqueFWDReads++; - } - } - - /** Examines the read and decrements the counts for all the monitored traits observed in this read. */ - public void removeRead(SAMRecord r) { - if ( r.getMappingQuality() == 0 ) { - // nonunique - nonUniqueReads--; - if ( ! r.getReadNegativeStrandFlag() ) nonUniqueFWDReads--; - } - else { - // unique - uniqueReads--; - if ( ! 
r.getReadNegativeStrandFlag() ) uniqueFWDReads--; - } - - } - - /** allocates new object, copies this object into it, and returns the copy */ - public WindowStats clone() { - WindowStats ret = new WindowStats(); - ret.uniqueReads = this.uniqueReads; - ret.nonUniqueReads = this.nonUniqueReads; - ret.uniqueFWDReads = this.uniqueFWDReads; - ret.nonUniqueFWDReads = this.nonUniqueFWDReads; - return ret; - } - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java deleted file mode 100644 index 9c370a597..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java +++ /dev/null @@ -1,86 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.util.CloseableIterator; -import org.broad.tribble.dbsnp.DbSNPCodec; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; - -/** - * DbSNPWindowCounter - * - * Count the number of upstream and downstream dbSNP entries from the current position using the specified 
window size. - * (really the window size upstream and downstream, so windowSize * 2) - * - * @Author Aaron - * @Date May 7th, 2010 - */ -@By(DataSource.REFERENCE) -@Requires({DataSource.REFERENCE, DataSource.REFERENCE_BASES}) -public class DbSNPWindowCounter extends LocusWalker { - - // what we read in new tracks with - private RMDTrack track; - - @Output - private PrintStream out; - - @Argument(fullName = "dbSNPFile", shortName = "db", doc="The dbsnp file to search upstream and downstream for nearby snps", required = true) - private File myDbSNPFile; - - @Argument(fullName = "dbSNPWindowSize", shortName = "dbw", doc="The distance to look both upstream and downstream for SNPs", required = true) - private int windowSize; - - - public void initialize() { - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - getToolkit().getArguments().unsafe); - track = builder.createInstanceOfTrack(DbSNPCodec.class,myDbSNPFile); - } - - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - CloseableIterator dbSNPs; - - // our upstream and downstream window locations - int windowStart = (int)Math.max(context.getLocation().getStart()-windowSize,0); - int windowStop = (int)context.getLocation().getStop()+windowSize; - - // query the dnSNP iterator - try { - dbSNPs = track.query(getToolkit().getGenomeLocParser().createGenomeLoc(context.getContig(),windowStart,windowStop)); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(myDbSNPFile, e); - } - - // count the number of dbSNPs we've seen - int counter = 0; - while(dbSNPs.hasNext()) - counter++; - out.println(context.getContig() + ":" + windowStart + "-" + context.getContig() + ":" + windowStop + "=" + - counter + " (dbSNP records)"); - return 1; - } - - public Long reduceInit() { return 0l; } - - public Long reduce(Integer value, Long sum) { - return value 
+ sum; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java deleted file mode 100644 index 5dcfa7cd9..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java +++ /dev/null @@ -1,240 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broad.tribble.bed.BEDFeature; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.commandline.Output; - -import java.util.*; -import java.io.PrintStream; - -/** - * Takes an interval list and annotates intervals with genes and exons falling within that interval - * Was written in order to annotate the Whole Exome Agilent designs at the Broad institute - * Bind the refseq rod as -B refseq,refseq,/path/to/refGene.txt - * Bind the interval list as -B interval_list,bed,/path/to/intervals.interval_list - * Bind the additional files file as -B gene*,bed,/path/to/other/file.bed - * @Author chartl - * @Date Apr 26, 2010 - */ -public class DesignFileGeneratorWalker extends RodWalker { - - private HashMap intervalBuffer = new HashMap(); - private HashSet refseqBuffer = new HashSet(); - private HashMap currentBedFeatures = new HashMap(); - - @Output - PrintStream out; - - public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - // 
three items to look up: interval_list, refseq, gene* - if ( tracker == null ) { - return null; - } - - List intervalsList= tracker.getGATKFeatureMetaData("interval_list",false); - List refseqList = tracker.getReferenceMetaData("refseq"); - List bedList = tracker.getGATKFeatureMetaData("gene",false); - - // put any unprocessed intervals into the interval buffer - - if ( intervalsList != null && intervalsList.size() > 0 ) { - for ( Object interval : intervalsList ) { - GenomeLoc loc = ((GATKFeature)interval).getLocation(); - if ( ! intervalBuffer.keySet().contains(loc) ) { - intervalBuffer.put(loc,new IntervalInfoBuilder()); - } - } - } - - // put any new refseq transcripts into the refseq buffer - - if ( refseqList != null && refseqList.size() > 0 ) { - for ( Object seq : refseqList ) { - if ( ! refseqBuffer.contains( (RefSeqFeature) seq) ) { - refseqBuffer.add( (RefSeqFeature) seq ); - } - } - } - - // update the bed features - for ( GATKFeature additionalGene : bedList ) { - currentBedFeatures.put(additionalGene.getName(),(BEDFeature) additionalGene.getUnderlyingObject()); - } - - cleanup(ref); - long updated = process(); - return updated; - } - - /** - * Workhorse method of this walker. Traverses intervals in the buffer and updates their corresponding - * info objects for overlaps with genes and gene exons. Can be expensive as it checks all buffered intervals - * against all buffered refseq rods, and against the current TCGA. - * @return the number of updated interval objects - */ - private long process() { - long nUpdate = 0l; - for ( GenomeLoc interval : intervalBuffer.keySet() ) { - for ( RefSeqFeature refseq : refseqBuffer ) { - if ( interval.overlapsP(refseq.getLocation()) && - ! 
intervalBuffer.get(interval).geneNames.contains(refseq.getTranscriptUniqueGeneName()) ) { - // if the interval overlaps the gene transcript; and the gene is not already represented in the interval - intervalBuffer.get(interval).update(refseq.getTranscriptUniqueGeneName(), - refseq.getExonsInInterval(interval), - refseq.getExonNumbersInInterval(interval)); - nUpdate++; - } - } - - for ( Map.Entry additionalGenes : currentBedFeatures.entrySet() ) { - GenomeLoc entryLoc = getToolkit().getGenomeLocParser().createGenomeLoc(additionalGenes.getValue().getChr(),additionalGenes.getValue().getStart(),additionalGenes.getValue().getEnd()); - if ( interval.overlapsP(entryLoc) && - ! additionalGenes.getValue().getName().equals("") && - ! intervalBuffer.get(interval).geneNames.contains(additionalGenes.getKey()+"_"+additionalGenes.getValue().getName())) { - - intervalBuffer.get(interval).update(additionalGenes.getKey()+"_"+additionalGenes.getValue().getName(), - new ArrayList(Arrays.asList(entryLoc)), - null); - nUpdate ++; - } - - } - } - - return nUpdate; - } - - /** - * Pruning method -- removes from the buffers all entries coming before the reference locus - * Does the same for intervals -- though upon removing them from the buffer, it prints them - * @return diddly - */ - public void cleanup(ReferenceContext ref) { - List toRemove = new ArrayList(); - for ( RefSeqFeature refseq : refseqBuffer ) { - if ( refseq.getLocation().isBefore(ref.getLocus()) ) { - toRemove.add(refseq); - } - } - - for ( RefSeqFeature refseq : toRemove ) { - refseqBuffer.remove(refseq); - } - - List iToRemove = new ArrayList(); - for ( GenomeLoc interval : intervalBuffer.keySet() ) { - if ( interval.isBefore(ref.getLocus())) { - writeOut(interval,intervalBuffer.get(interval)); - iToRemove.add(interval); - } - } - - for ( GenomeLoc interval : iToRemove) { - intervalBuffer.remove(interval); - } - - for ( Map.Entry entry : currentBedFeatures.entrySet() ) { - GenomeLoc entryLoc = 
getToolkit().getGenomeLocParser().createGenomeLoc(entry.getValue().getChr(),entry.getValue().getStart(),entry.getValue().getEnd()); - if ( entryLoc.isBefore(ref.getLocus()) ) { - currentBedFeatures.remove(entry.getKey()); - } - } - } - - public void writeOut(GenomeLoc interval, IntervalInfoBuilder info) { - out.printf("%s\t%d\t%d\t%s%n",interval.getContig(),interval.getStart(),interval.getStop(),info.toString()); - } - - public Long reduceInit() { - return 0l; - } - - public Long reduce(Long map, Long prevRed) { - if ( map == null ) { - return prevRed; - } - - return map + prevRed; - } - - public void onTraversalDone(Long l) { - // finish out the stuff in the buffer - for ( GenomeLoc loc : intervalBuffer.keySet() ) { - out.printf("%s\t%d\t%d\t%s%n",loc.getContig(),loc.getStart(),loc.getStop(),"Unknown"); - } - } -} - -class IntervalInfoBuilder { - // container class -- holds information pertinent to an info (genes, exons, etc) - public List geneNames; - public Map> exonsByGene; - public Map> exonNumbersByGene; - - public IntervalInfoBuilder() { - geneNames = new ArrayList(); - exonsByGene = new HashMap>(); - exonNumbersByGene = new HashMap>(); - } - - public void update(String gene, List exons, List exonNumbers) { - if ( geneNames.contains(gene) ) { - if ( gene.startsWith("gene") ) { - // exons are split up one per bed, so update the exon list for this gene - for ( int eOff = 0; eOff < exons.size(); eOff++) { - if ( ! 
exonNumbersByGene.get(gene).contains( exonNumbers.get(eOff) ) ) { - exonsByGene.get(gene).add(exons.get(eOff)); - exonNumbersByGene.get(gene).add(exonNumbers.get(eOff)); - } - } - } else { - throw new ReviewedStingException("Attempting to update an IntervalInfoBuilder twice with the same (non-TCGA) gene: "+gene); - } - } else { - - geneNames.add(gene); - exonsByGene.put(gene,exons); - exonNumbersByGene.put(gene,exonNumbers); - - } - } - - public String toString() { - StringBuffer buf = new StringBuffer(); - - if ( geneNames.size() == 0 ) { - buf.append("Unknown"); - } - - for ( int geneIndex = 0; geneIndex < geneNames.size(); geneIndex++) { - if ( geneIndex > 0 ) { - buf.append("\t"); - } - buf.append(geneNames.get(geneIndex)); - if ( ! geneNames.get(geneIndex).startsWith("gene")) { - buf.append("["); - if ( exonsByGene.get(geneNames.get(geneIndex)).size() > 0 ) { - for ( int exonIndex = 0; exonIndex < exonsByGene.get(geneNames.get(geneIndex)).size(); exonIndex++ ) { - if ( exonIndex > 0 ) { - buf.append(','); - } - buf.append(String.format("exon_%d",exonNumbersByGene.get(geneNames.get(geneIndex)).get(exonIndex))); - } - } else { - buf.append("Intron/UTR"); - } - buf.append("]"); - } - } - - return buf.toString(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DetectWGAWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DetectWGAWalker.java deleted file mode 100644 index 529b78aca..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DetectWGAWalker.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of 
the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; -import org.broadinstitute.sting.oneoffprojects.utils.ReadPair; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import net.sf.samtools.SAMRecord; - -import java.util.Map; -import java.util.HashMap; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 13, 2010 - * Time: 12:45:42 PM - * To change this template use File | Settings | File Templates. 
- */ - -@WalkerName("DetectWGA") -@Requires(value={DataSource.REFERENCE_BASES}) - -public class DetectWGAWalker extends ReadWalker { - - private int TIP_LENGTH = 10; - private int TIP_MM_THRESHOLD = 1; - private double TIP_AV_QUAL_THRESHOLD = 15.0; - - private boolean DEBUG = true; - - Map pairCache = null; - Map fragmentSizeMap = null; // by library - private ReferenceDataSource refData; - private byte[] refBases; - - - - - @Override - public void initialize() { - refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile); - pairCache = new HashMap(); - fragmentSizeMap = new HashMap(); - } - - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - - if ( ! read.getReadPairedFlag() ) return 0; // for now!! - - // read is paired - - cacheReadAsPair(read); - - /* - // if the read is already mapped (uniquely), we check if it may have the "colored tip" artifact on either side: - if ( AlignmentUtils.isReadUniquelyMapped(read) ) { - - TipInfo tips = countTipMismatches(read,TIP_LENGTH); - - if ( tips.leftMM() >= TIP_MM_THRESHOLD || tips.rightMM() >= TIP_MM_THRESHOLD ) { - if ( DEBUG ) { - out.println(" Read "+read.getReadName()+ " has "+tips.leftMM()+"/"+tips.rightMM()+" mismatches in the tips"); - out.println(" Pair orientation: "+pair.getPairType()); - } - // try adding read to existing assemblies: - AlignmentInfo al = alignToAllAddToBest(read,Math.min(3,tips.leftMM()+tips.rightMM())-1); - if ( al == null ) { - if ( tips.leftMM() >= TIP_MM_THRESHOLD && tips.leftQ() >= TIP_AV_QUAL_THRESHOLD || - tips.rightMM() >= TIP_MM_THRESHOLD && tips.rightQ() >= TIP_AV_QUAL_THRESHOLD ) { - if ( DEBUG ) out.println(" Initialized new assembly.") ; - Assembly a = new Assembly(read.getReadBases(),read.getReadName(),read.getAlignmentStart()); - tryAndAddUnmapped(a); // see if we got unmapped reads that would align nicely - assemblies.add(a); - } - } - } - return 1; - } -*/ - - - - return null; //To change body of implemented 
methods use File | Settings | File Templates. - } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - return null; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(Integer value, Integer sum) { - return null; //To change body of implemented methods use File | Settings | File Templates. - } - - /** little helper: if we already cached the pair object for this read, just add the read to that object; if we did not - instantiate - * new pair objetc first and register it in the map, then add the read; this method also updates other cache(s)/trackers as needed, e.g. - * fragment size map - */ - private void cacheReadAsPair(SAMRecord read) { - ReadPair pair = pairCache.get( read.getReadName() ); - if ( pair == null ) { - pair = new ReadPair(read); - pairCache.put(read.getReadName(),pair); - } - - pair.addRead(read); - - // if it's a good pair, add its fragment size to the stats: - if ( pair.hasBothEnds() && pair.bothEndsMapped() && pair.isProper() ) { - String lib = read.getReadGroup().getLibrary(); - MathUtils.RunningAverage fSize = fragmentSizeMap.get(lib); - if ( fSize == null ) { - fSize = new MathUtils.RunningAverage(); - fragmentSizeMap.put(lib,fSize); - } - fSize.add(pair.getFragmentSize()); - } - - } - - private TipInfo countTipMismatches(SAMRecord read, int tip_length) { - - AlignmentUtils.MismatchCount left_mm = AlignmentUtils.getMismatchCount(read,refBases,read.getAlignmentStart()-1,0,tip_length); - - int right_start = read.getReadLength()-tip_length; - AlignmentUtils.MismatchCount right_mm = 
AlignmentUtils.getMismatchCount(read,refBases,read.getAlignmentStart()-1,right_start,read.getReadLength()-right_start); - - return new TipInfo(left_mm,right_mm); - } - - class TipInfo { - AlignmentUtils.MismatchCount left_mm; - AlignmentUtils.MismatchCount right_mm; - double left_avQ; - double right_avQ; - - public TipInfo(AlignmentUtils.MismatchCount l,AlignmentUtils.MismatchCount r) { - left_mm = l; - right_mm = r; - left_avQ = (l.numMismatches ==0 ? 0 : ((double)l.mismatchQualities)/l.numMismatches ); - right_avQ = (r.numMismatches ==0 ? 0 : ((double)r.mismatchQualities)/r.numMismatches ); - } - - public int leftMM() { return left_mm.numMismatches; } - public int rightMM() { return right_mm.numMismatches; } - public double leftQ() { return left_avQ; } - public double rightQ() { return right_avQ; } - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DownsamplingValidationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DownsamplingValidationWalker.java deleted file mode 100644 index f50e33550..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DownsamplingValidationWalker.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.samread.SAMReadFeature; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.commandline.Argument; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Arrays; - -import net.sf.samtools.SAMRecord; - -/** - * Checks a given downsampled pileup against the full pileup to ensure that the downsampled pileup could - * possibly be a valid version of the full pileup. - * - * @author mhanna - * @version 0.1 - */ -public class DownsamplingValidationWalker extends LocusWalker { - @Argument(fullName="max_expected_number_of_reads",shortName="menr",doc="The expected number of reads chosed by the downsampler. 
Fewer than this number might be added to a given alignment start, but more than this should never be.",required=true) - private int maxExpectedNumberOfReads = 0; - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - ReadBackedPileup pileup = context.getBasePileup(); - Collection allFeatures = tracker.getReferenceMetaData("reads"); - - Collection unsampledReadsStartingAtThisLocus = new ArrayList(); - for(Object featureCandidate: allFeatures) { - if(featureCandidate instanceof SAMReadFeature) { - SAMReadFeature feature = (SAMReadFeature)featureCandidate; - if(feature.getReferenceName().equals(ref.getLocus().getContig()) && feature.getAlignmentStart() == ref.getLocus().getStart()) - unsampledReadsStartingAtThisLocus.add(feature); - } - } - Collection sampledReadsStartingAtThisLocus = new ArrayList(); - for(SAMRecord read: pileup.getReads()) { - if(read.getReferenceName().equals(ref.getLocus().getContig()) && read.getAlignmentStart() == ref.getLocus().getStart()) - sampledReadsStartingAtThisLocus.add(read); - } - - int matchingReadsFound = 0; - if(unsampledReadsStartingAtThisLocus.isEmpty()) { - if(!sampledReadsStartingAtThisLocus.isEmpty()) - throw new ReviewedStingException("Downsampler hallucinated a read starting at locus "+ref.getLocus()); - } - else { - boolean foundMatch = false; - for(SAMReadFeature unsampledRead: unsampledReadsStartingAtThisLocus) { - for(SAMRecord sampledRead: sampledReadsStartingAtThisLocus) { - if(unsampledRead.getReadName().equals(sampledRead.getReadName()) && - Arrays.equals(unsampledRead.getReadBases(),sampledRead.getReadBases())) { - foundMatch = true; - matchingReadsFound++; - } - } - } - - if(!foundMatch) - throw new ReviewedStingException("Downsampler failed to include any read starting at locus "+ref.getLocus()); - - if(matchingReadsFound > maxExpectedNumberOfReads) - throw new ReviewedStingException("Downsampler found too many reads starting at locus "+ref.getLocus()); - } - - return 
matchingReadsFound; - } - - // Given result of map function - public Long reduceInit() { return 0L; } - public Long reduce(Integer value, Long sum) { - return value + sum; - } - - public Long treeReduce(Long lhs, Long rhs ) { - return lhs+rhs; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ErrorRatePerReadPosition.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ErrorRatePerReadPosition.java deleted file mode 100755 index adf5173ce..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ErrorRatePerReadPosition.java +++ /dev/null @@ -1,162 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.List; -import java.util.HashMap; -import java.util.HashSet; -import java.io.PrintStream; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMReadGroupRecord; - -/** - * Computes the read error rate per position in read (in the original 5'->3' orientation that the read had coming off the machine) - */ -public class ErrorRatePerReadPosition extends LocusWalker { - @Output PrintStream out; - @Argument(fullName="min_base_quality_score", shortName="mbq", doc="Minimum base quality required to consider a base for calling (default: 0)", required=false) public Integer MIN_BASE_QUAL = 0; - @Argument(fullName="min_mapping_quality_score", shortName="mmq", doc="Minimum read mapping quality required to consider a read for calling (default: 0)", required=false) public Integer MIN_MAPPING_QUAL = 0; - - private HashMap mismatches; - private HashMap counts; - private HashMap quals; 
- private HashMap> readLengthsPerReadGroup; - private HashSet readLengths; - private int readLength = 10000; - - public void initialize() { - mismatches = new HashMap(); - counts = new HashMap(); - quals = new HashMap(); - readLengthsPerReadGroup = new HashMap>(); - readLengths = new HashSet(); - - for (SAMReadGroupRecord rg : this.getToolkit().getSAMFileHeader().getReadGroups()) { - int[] mm = new int[readLength]; - int[] cm = new int[readLength]; - int[] qm = new int[readLength]; - - mismatches.put(rg.getReadGroupId(), mm); - counts.put(rg.getReadGroupId(), cm); - quals.put(rg.getReadGroupId(), qm); - } - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - List offsets = context.getOffsets(); - List reads = context.getReads(); - - for (int i = 0; i < offsets.size(); i++) { - int offset = offsets.get(i); - - if (reads.get(i).getMappingQuality() >= MIN_MAPPING_QUAL && - reads.get(i).getBaseQualities()[offset] >= MIN_BASE_QUAL) { - - char readBase = reads.get(i).getReadString().charAt(offset); - - int refIndex = ref.getBaseIndex(); - int readIndex = BaseUtils.simpleBaseToBaseIndex(readBase); - - if (!reads.get(i).getReadNegativeStrandFlag() && (!reads.get(i).getReadPairedFlag() || reads.get(i).getFirstOfPairFlag())) { - String keyName = reads.get(i).getReadGroup().getReadGroupId(); - - mismatches.get(keyName)[offset] += (refIndex != readIndex) ? 
1 : 0; - counts.get(keyName)[offset]++; - quals.get(keyName)[offset] += reads.get(i).getBaseQualities()[offset]; - - int readLength = reads.get(i).getReadLength(); - if (!readLengthsPerReadGroup.containsKey(keyName)) { - readLengthsPerReadGroup.put(keyName, new HashSet()); - } - - readLengthsPerReadGroup.get(keyName).add(readLength); - readLengths.add(readLength); - } - } - } - - return null; - } - - public Integer reduceInit() { - return null; - } - - public Integer reduce(Integer value, Integer sum) { - return null; - } - - public void onTraversalDone(Integer sum) { - String[] rgs = mismatches.keySet().toArray(new String[1]); - - out.printf("position"); - for (String rg : rgs) { out.printf("\t%s", rg); } - //out.printf("\tmean\tsd\tmin\tmax"); - - //for (int readLength : readLengths) { - // out.printf("\t%dreadLength", readLength); - //} - - out.println(); - - for (int i = 0; i < readLength; i++) { - boolean print = false; - String row = ""; - - row += String.format("%d", i); - - double rsum = 0.0; - double min = 1.00; - double max = 0.00; - - for (String rg : rgs) { - double value = ((double) mismatches.get(rg)[i])/((double) counts.get(rg)[i]); - //double value = ((double) quals.get(rg)[i])/((double) counts.get(rg)[i]); - - if (Double.isInfinite(value) || Double.isNaN(value)) { - value = 0.0; - } else { - print = true; - } - - row += String.format("\t%f", value); - - rsum += value; - if (value > max) { max = value; } - if (value < min) { min = value; } - } - - double mean = rsum/rgs.length; - - double squareDeviationSum = 0.0; - for (String rg : rgs) { - double value = ((double) mismatches.get(rg)[i])/((double) counts.get(rg)[i]); - - if (Double.isInfinite(value) || Double.isNaN(value)) { - value = 0.0; - } else { - print = true; - } - - squareDeviationSum += Math.pow(value - mean, 2.0); - } - - double sd = Math.sqrt(squareDeviationSum/rgs.length); - - //row += String.format("\t%f\t%f\t%f\t%f", mean, sd, min, max); - - row += String.format("%n"); - - if (print) 
{ - out.print(row); - } - } - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/GenotypeConcordanceTable.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/GenotypeConcordanceTable.java deleted file mode 100755 index d4b73cb59..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/GenotypeConcordanceTable.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; - -import java.io.PrintStream; -import java.util.*; - -/** - * Emits a table of chrom/pos/sample/GQ/LoDGivenXX for AA, AB, BB/called genotypes for eval and comp for eval and comp VCFs - */ -@Requires(value={}) -public class GenotypeConcordanceTable extends RodWalker { - public static final String EVAL_NAME = "eval"; - public static final String COMP_NAME = "comp"; - - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(doc="If provided, we will include information where EVAL is missing and COMP is missing", required=false) - protected boolean keepDoubleMissing = false; - - @Argument(doc="If provided, we will include information where EVAL is no-called", required=false) - protected boolean keepEvalNoCall = false; - - @Argument(doc="If provided, we will include information where COMP is no-called", required=false) - protected boolean keepCompNoCall = false; - - Set samples; - - @Override - public void initialize() { - samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), Arrays.asList(EVAL_NAME)); - logger.info("Samples: " + samples); - List fields = Arrays.asList("chrom", "pos", "sample", "GQ", "LofDGivenAA", "LofDGivenAB", "LofDGivenBB", "concordant", "concordantInt", EVAL_NAME, COMP_NAME); - 
out.println(Utils.join("\t", fields)); - } - - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - if ( tracker.getNBoundRodTracks() > 0 ) { - for ( String sample : samples ) { - Genotype evalGT = getGenotype(tracker, ref, sample, EVAL_NAME); - Genotype compGT = getGenotype(tracker, ref, sample, COMP_NAME); - - if ( evalGT == null && compGT == null && ! keepDoubleMissing ) - continue; - - if ( (evalGT == null || evalGT.isNoCall()) && ! keepEvalNoCall ) - continue; - - if ( (compGT == null || compGT.isNoCall()) && ! keepCompNoCall ) - continue; - - out.printf("%s\t%s\t%s\t", ref.getLocus().getContig(), ref.getLocus().getStart(), sample); - String evalGQ = evalGT == null ? "NA" : String.format("%d", (int)(10*evalGT.getNegLog10PError())); - - String LofDGivenAA = "-1", LofDGivenAB = "-1", LofDGivenBB = "-1"; - if ( evalGT != null ) { - double[] pls = evalGT.getLikelihoods().getAsVector(); - if ( pls != null ) { // not missing - LofDGivenAA = String.format("%.0f", pls[0]); - LofDGivenAB = String.format("%.0f", pls[1]); - LofDGivenBB = String.format("%.0f", pls[2]); - } - } - - String concordance = evalGT == null || compGT == null ? "NA" : String.format("%s", evalGT.getType() == compGT.getType()); - String concordanceInt = Integer.toString(evalGT == null || compGT == null ? -1 : (evalGT.getType() == compGT.getType() ? 1 : 0)); - String evalType = evalGT == null ? "MISSING" : evalGT.getType().toString(); - String compType = compGT == null ? 
"MISSING" : compGT.getType().toString(); - out.println(Utils.join("\t", Arrays.asList(evalGQ, LofDGivenAA, LofDGivenAB, LofDGivenBB, concordance, concordanceInt, evalType, compType))); - } - } - - return 1; - } - - private Genotype getGenotype(RefMetaDataTracker tracker, ReferenceContext ref, String sample, String rod) { - for ( VariantContext vc : tracker.getVariantContexts(ref, rod, null, ref.getLocus(), true, false) ) { - if ( vc.isNotFiltered() && vc.hasGenotype(sample) ) - return vc.getGenotype(sample); - else - return null; - } - - return null; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - @Override - public void onTraversalDone(Integer sum) {} -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IOCrusherWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IOCrusherWalker.java deleted file mode 100644 index cea131cd9..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IOCrusherWalker.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileHeader; - -import java.util.ArrayList; - -/** - * IOCrusherWalker passes every read through unchanged and writes it to nWaysOut BAM files named - * outputBase.i.bam (i = 0 .. nWaysOut-1). When readScaling is at least 1, each read is written - * ceil(readScaling) times to every output; otherwise each read is kept with probability - * readScaling, decided independently for each output file. - * - * NOTE: the paragraph below appears to have been copied from ReadErrorRateWalker and does not - * describe this class. - * - * ReadErrorRateWalker assesses the error rate per read position ('cycle') by comparing the - * read to its home on the reference and noting the mismatch rate. It ignores reads with - * indels in them, treats high and low-quality reference bases the same, and does not count - * ambiguous bases as mismatches. It's also thread-safe, so you can process a slew of reads - * in short order. 
- * - * @author Kiran Garimella - */ -public class IOCrusherWalker extends ReadWalker> { - @Argument(shortName="nWaysOut",doc="n ways out",required=false) - public int nWaysOut = 1; - - @Argument(shortName="readScaling",doc="read scaling",required=false) - public float readScaling = 1; - - @Argument(shortName="outputBase",doc="output base",required=true) - public String outputBase; - - @Argument(fullName = "bam_compression", shortName = "compress", doc = "Compression level to use for writing BAM files", required = false) - public Integer BAMcompression = 5; - - public long nReadsRead = 0; - public long nReadsWritten = 0; - - /** - * - */ - public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - nReadsRead++; - return read; - } - - /** - * - */ - public ArrayList reduceInit() { - ArrayList outputs = new ArrayList(nWaysOut); - for ( int i = 0; i < nWaysOut; i++ ) { - SAMFileHeader header = this.getToolkit().getSAMFileHeader(); - outputs.add(ReadUtils.createSAMFileWriterWithCompression(header, true, outputBase + "." + i + ".bam", BAMcompression)); - } - return outputs; - } - - /** - * Summarize the error rate data. 
- * - */ - public ArrayList reduce(SAMRecord read, ArrayList outputs) { - for ( SAMFileWriter out : outputs ) { - if ( readScaling >= 1.0 ) { - int nCopies = (int)Math.ceil(readScaling); - for ( int i = 0; i < nCopies; i++) { - out.addAlignment(read); - nReadsWritten++; - } - } else if ( Math.random() < readScaling ) { - out.addAlignment(read); - nReadsWritten++; - } - } - - return outputs; - } - - /** - * - */ - public void onTraversalDone(ArrayList outputs) { - for ( SAMFileWriter out : outputs ) { - out.close(); - } - System.out.printf("Reads: read %d written %d%n", nReadsRead, nReadsWritten); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java deleted file mode 100644 index 399a3e6df..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java +++ /dev/null @@ -1,196 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec; -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import 
org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.io.File; -import java.util.*; - -public class IndelAnnotator extends RodWalker { - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter vcfWriter = null; - - @Argument(fullName="refseq", shortName="refseq", doc="Name of RefSeq transcript annotation file", required=true) - String RefseqFileName = null; - - private SeekableRODIterator refseqIterator; - - public void initialize() { - if ( RefseqFileName != null ) { - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - getToolkit().getArguments().unsafe); - RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName)); - - refseqIterator = new SeekableRODIterator(refseq.getHeader(), - refseq.getSequenceDictionary(), - getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - refseq.getIterator()); - - logger.info("Using RefSeq annotations from " + RefseqFileName); - } - - if ( refseqIterator == null ) logger.info("No annotations available"); - - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("source", "IndelAnnotator")); - hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName())); - - HashSet anno = new HashSet(); - anno.add(new VCFInfoHeaderLine("cDNAchange", 1, VCFHeaderLineType.String, "cDNAchange")); - anno.add(new VCFInfoHeaderLine("classification", 1, VCFHeaderLineType.String, "classification")); - anno.add(new VCFInfoHeaderLine("codonchange", 1, VCFHeaderLineType.String, "codonchange")); - anno.add(new VCFInfoHeaderLine("gene", 1, VCFHeaderLineType.String, "gene")); - anno.add(new VCFInfoHeaderLine("genomechange", 1, VCFHeaderLineType.String, "genomechange")); - anno.add(new 
VCFInfoHeaderLine("proteinchange", 1, VCFHeaderLineType.String, "proteinchange")); - anno.add(new VCFInfoHeaderLine("strand", 1, VCFHeaderLineType.String, "strand")); - anno.add(new VCFInfoHeaderLine("transcript", 1, VCFHeaderLineType.String, "transcript")); - anno.add(new VCFInfoHeaderLine("type", 1, VCFHeaderLineType.String, "type")); - hInfo.addAll(anno); - - VCFHeader vcfHeader = new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit())); - vcfWriter.writeHeader(vcfHeader); - } - - public Long reduceInit() { - return 0l; - } - - private TreeMap getAnnotationMap(VariantContext vc, VariantContext dbsnp, RODRecordList ann) { - TreeMap anns = new TreeMap(); - anns.put("gene", "---"); - anns.put("type", "IGR"); - anns.put("transcript", "---"); - anns.put("strand", "+"); - anns.put("proteinchange", "---"); - anns.put("genomechange", "---"); - anns.put("codonchange", "---"); - anns.put("cDNAchange", "---"); - - if (dbsnp != null) { - anns.put("ID", dbsnp.getAttribute("ID")); - } - - if (vc.isIndel()) { - anns.put("classification", vc.isInsertion() ? "INS" : "DEL"); - } - - if ( ann != null ) { - TreeMap> deleteriousnessRankedAnnotations = new TreeMap>(); - - for (int transcriptIndex = 0; transcriptIndex < ann.size(); transcriptIndex++) { - Transcript t = (Transcript) ann.get(transcriptIndex).getUnderlyingObject(); - TreeMap plausibleAnnotations = new TreeMap(); - Integer rank = 0; - - plausibleAnnotations.put("gene", t.getGeneName()); - plausibleAnnotations.put("transcript", t.getTranscriptId()); - plausibleAnnotations.put("strand", t.getStrand() == -1 ? 
"-" : "+"); - - if ( RefSeqFeature.isExon(ann) ) { - if ( RefSeqFeature.isCodingExon(ann) ) { - //b.append(annCoding); // both exon and coding = coding exon sequence - if (vc.getIndelLengths().get(0) % 3 == 0) { - plausibleAnnotations.put("type", "Non-frameshift"); - rank = 4; - } else { - plausibleAnnotations.put("type", "Frameshift"); - rank = 0; - } - } else { - //b.append(annUTR); // exon but not coding = UTR - if (t.getStrand() == 1) { - plausibleAnnotations.put("type", "5'-UTR"); - rank = 2; - } else { - plausibleAnnotations.put("type", "3'-UTR"); - rank = 3; - } - } - } else { - if ( RefSeqFeature.isCoding(ann) ) { - //b.append(annIntron); // not in exon, but within the coding region = intron - GenomeLoc ig = getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd()); - GenomeLoc cl = t.getCodingLocation(); - GenomeLoc g = t.getLocation(); - - boolean spliceSiteDisruption = false; - - for (GenomeLoc exon : t.getExons()) { - GenomeLoc expandedExon = getToolkit().getGenomeLocParser().createGenomeLoc(exon.getContig(), exon.getStart() - 6, exon.getStop() + 6); - - if (ig.overlapsP(expandedExon)) { - spliceSiteDisruption = true; - } - } - - if (spliceSiteDisruption) { - plausibleAnnotations.put("type", "SpliceSiteDisruption"); - rank = 1; - } else { - plausibleAnnotations.put("type", "Intron"); - rank = 5; - } - } else { - //b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... 
- plausibleAnnotations.put("type", "Unknown"); - rank = 6; - } - } - - deleteriousnessRankedAnnotations.put(rank, plausibleAnnotations); - } - - anns.putAll(deleteriousnessRankedAnnotations.firstEntry().getValue()); - } - - - return anns; - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) { - if ( tracker == null ) { return 0; } - - VariantContext vc = tracker.getVariantContext(ref, "variant", null, con.getLocation(), true); - if ( vc == null ) { return 0; } - - Collection dbsnps = tracker.getVariantContexts(ref, "dbsnp", null, con.getLocation(), true, true); - VariantContext dbsnp = null; - - if (dbsnps != null && dbsnps.size() > 0) { - ArrayList dbsnpsarray = new ArrayList(dbsnps); - dbsnp = dbsnpsarray.get(0); - } - - RODRecordList annotationList = refseqIterator.seekForward(ref.getLocus()); - TreeMap annotationMap = getAnnotationMap(vc, dbsnp, annotationList); - - Map attrs = new HashMap(vc.getAttributes()); - attrs.putAll(annotationMap); - - vc = VariantContext.modifyAttributes(vc, attrs); - vcfWriter.add(vc, ref.getBase()); - - return 1; - } - - public Long reduce(Integer i, Long j) { - return i + j; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelConsistencyReadCounter.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelConsistencyReadCounter.java deleted file mode 100755 index 1c7a2d8e4..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelConsistencyReadCounter.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2010. 
- * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; - -@By(DataSource.READS) -@Requires(value={},referenceMetaData=@RMD(name="indels", type=VariantContext.class)) -// walker to count reads that are and are not consistent with homozygous indels -public class IndelConsistencyReadCounter extends ReadWalker { - - private long consistentReads = 0, misalignedReads = 0; - - public boolean filter(ReferenceContext ref, SAMRecord read) { - return !doNotTryToClean(read); - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - VariantContext indel = null; - for ( Collection rods : metaDataTracker.getContigOffsetMapping().values() ) { - Iterator rodIter = rods.iterator(); - while ( rodIter.hasNext() ) { - Object rod = rodIter.next().getUnderlyingObject(); - if ( VariantContextAdaptors.canBeConvertedToVariantContext(rod)) { - VariantContext vc = VariantContextAdaptors.toVariantContext("", rod, ref); - if ( vc.getSource().equals("indels") ) { - indel = vc; - break; - } - } - } - } - - if ( indel != null ) { - if ( read.getAlignmentEnd() == indel.getStart() ) - return 0; - - if ( !containsAnyIndel(read) || !containsIndel(read, indel) ) - misalignedReads++; - else - consistentReads++; - } - - return 0; - } - - private boolean doNotTryToClean(SAMRecord read) { - return read.getReadUnmappedFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getReadFailsVendorQualityCheckFlag() || - read.getMappingQuality() == 0 || - read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || - (BadMateFilter.hasBadMate(read)); - } 
- - private static boolean containsAnyIndel(final SAMRecord r) { - final Cigar cigar = r.getCigar(); - if ( cigar == null ) - return false; - - for ( final CigarElement e : cigar.getCigarElements() ) { - if (e.getOperator() == CigarOperator.D || e.getOperator() == CigarOperator.I ) - return true; - } - - return false; - } - - private static boolean containsIndel(final SAMRecord r, final VariantContext vc) { - int indelStart = vc.getStart() + 1; - int readPos = r.getAlignmentStart(); - - if ( vc.isInsertion() && indelStart == readPos ) - return true; - - final Cigar cigar = r.getCigar(); - - int idx = 0; - while ( readPos < indelStart && idx < cigar.numCigarElements() ) { - - final CigarElement ce = cigar.getCigarElement(idx); - switch ( ce.getOperator() ) { - case M: - case I: - readPos += ce.getLength(); - break; - default: - break; - } - - idx++; - } - - if ( idx == cigar.numCigarElements() ) - return false; - - if ( readPos != indelStart ) - return false; - - final CigarElement ce = cigar.getCigarElement(idx); - if ( vc.isDeletion() ) - return ( ce.getOperator() == CigarOperator.D && ce.getLength() == vc.getReference().getBases().length); - return ( ce.getOperator() == CigarOperator.I && ce.getLength() == vc.getAlternateAllele(0).getBases().length); - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - System.out.println(consistentReads + " reads were initially consistent"); - System.out.println(misalignedReads + " reads were initially misaligned"); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CountCovariatesGatherer.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CountCovariatesGatherer.java deleted file mode 100755 index 3a50fbe4d..000000000 --- 
a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CountCovariatesGatherer.java +++ /dev/null @@ -1,106 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import org.broadinstitute.sting.commandline.Gatherer; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.HashMap; -import java.util.List; -import java.util.regex.Pattern; - -/** - * Created by IntelliJ IDEA. - * User: carneiro - * Date: 3/29/11 - * Time: 3:54 PM - * To change this template use File | Settings | File Templates. - */ - - -public class CountCovariatesGatherer extends Gatherer { - - ///////////////////////////// - // Private Member Variables - ///////////////////////////// - private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); - private static final String EOF_MARKER = "EOF"; - - private HashMap dataMap; - - - private void addCSVData (String line) { - String[] covariates = line.split(","); - String key = ""; - RecalDatumOptimized values; - - for (int i = 0; i < covariates.length-3; i++) { - key += covariates[i] + ","; - } - - values = new RecalDatumOptimized(Integer.parseInt(covariates[covariates.length-3]), - Integer.parseInt(covariates[covariates.length-2])); - - if (dataMap.get(key) != null) { - RecalDatumOptimized currentValues = dataMap.get(key); - values.increment(currentValues); - } - - dataMap.put(key, values); - } - - @Override - public void gather(List inputs, File output) { - dataMap = new HashMap(); - PrintStream o; - try { - o = new PrintStream(output); - } catch ( FileNotFoundException e) { - throw new UserException("File to be output by CountCovariates Gather function was not found"); - } - - boolean sawEOF = false; - boolean 
printedHeader = false; - - // Read input files - for ( File RECAL_FILE : inputs) { - try { - for ( String line : new XReadLines(RECAL_FILE) ) { - if ( EOF_MARKER.equals(line) ) { - sawEOF = true; // sanity check - } - else if(COMMENT_PATTERN.matcher(line).matches()) { - ; // It doesn't make any sense to print intermediate comments, unless we merge them somehow (would require strict definition for the header) - } - else if (COVARIATE_PATTERN.matcher(line).matches()) { - if (!printedHeader) - o.println(line); - } - else { // Found a line of data - addCSVData(line); // Parse the line and add the data to the HashMap - } - } - - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); - } - - if ( !sawEOF ) { - final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!"; - throw new UserException.MalformedFile(RECAL_FILE, errorMessage); - } - printedHeader = true; - } - - // Write output file from dataMap - for(String key : dataMap.keySet()) { - RecalDatumOptimized values = dataMap.get(key); - String v = values.getNumObservations() + "," + values.getNumMismatches() + "," + values.empiricalQualByte(); - o.println(key + v); - } - o.println("EOF"); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Covariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Covariate.java deleted file mode 100755 index c84494908..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Covariate.java +++ /dev/null @@ -1,56 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import net.sf.samtools.SAMRecord; -//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; - -/* - * Copyright (c) 2009 The Broad Institute - * - * 
Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Oct 30, 2009 - * - * The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read, offset, and corresponding reference bases - * In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code. - * This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up. 
- */ - -public interface Covariate { - public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers - public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate - //values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows - //read-specific calculations to be done just once rather than for each offset. -} - -interface RequiredCovariate extends Covariate { -} - -interface StandardCovariate extends Covariate { -} - -interface ExperimentalCovariate extends Covariate { -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CycleCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CycleCovariate.java deleted file mode 100755 index 4e8565b6a..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CycleCovariate.java +++ /dev/null @@ -1,280 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import net.sf.samtools.SAMRecord; - -//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.Arrays; -import java.util.List; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, 
distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Oct 30, 2009 - * - * The Cycle covariate. - * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) - * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle - * For example, for the read: AAACCCCGAAATTTTTACTG - * the cycle would be 11111111222333333344 - * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round - */ - -public class CycleCovariate implements StandardCovariate { - // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - if( RAC.DEFAULT_PLATFORM != null ) { - if( RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SLX" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ILLUMINA" ) || - RAC.DEFAULT_PLATFORM.contains( "454" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SOLID" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ABI_SOLID" ) ) { - // nothing to do - } else { - throw new UserException.CommandLineException("The requested default platform (" + 
RAC.DEFAULT_PLATFORM +") is not a recognized platform. Implemented options are illumina, 454, and solid"); - } - } - } - - /* - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - - int cycle = 1; - - //----------------------------- - // ILLUMINA and SOLID - //----------------------------- - - if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX" - read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID" - cycle = offset + 1; - if( read.getReadNegativeStrandFlag() ) { - cycle = read.getReadLength() - offset; - } - } - - //----------------------------- - // 454 - //----------------------------- - - else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454" - final byte[] bases = read.getReadBases(); - - // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change - // For example, AAAAAAA was probably read in two flow cycles but here we count it as one - if( !read.getReadNegativeStrandFlag() ) { // Forward direction - int iii = 0; - while( iii <= offset ) - { - while( iii <= offset && bases[iii] == (byte)'T' ) { iii++; } - while( iii <= offset && bases[iii] == (byte)'A' ) { iii++; } - while( iii <= offset && bases[iii] == (byte)'C' ) { iii++; } - while( iii <= offset && bases[iii] == (byte)'G' ) { iii++; } - if( iii <= offset ) { cycle++; } - if( iii <= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii++; } - - } - } else { // Negative direction - int iii = bases.length-1; - while( iii >= offset ) - { - while( iii >= offset && bases[iii] == 
(byte)'T' ) { iii--; } - while( iii >= offset && bases[iii] == (byte)'A' ) { iii--; } - while( iii >= offset && bases[iii] == (byte)'C' ) { iii--; } - while( iii >= offset && bases[iii] == (byte)'G' ) { iii--; } - if( iii >= offset ) { cycle++; } - if( iii >= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii--; } - } - } - } - - //----------------------------- - // SOLID (unused), only to be used in conjunction with PrimerRoundCovariate - //----------------------------- - - //else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { - // // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf - // int pos = offset + 1; - // if( read.getReadNegativeStrandFlag() ) { - // pos = read.getReadLength() - offset; - // } - // cycle = pos / 5; // integer division - //} - - //----------------------------- - // UNRECOGNIZED PLATFORM - //----------------------------- - - else { // Platform is unrecognized so revert to the default platform but warn the user first - if( defaultPlatform != null) { // The user set a default platform - if( !warnedUserBadPlatform ) { - Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + - "Defaulting to platform = " + defaultPlatform + "." ); - } - warnedUserBadPlatform = true; - - read.getReadGroup().setPlatform( defaultPlatform ); - return getValue( read, offset ); // A recursive call - } else { // The user did not set a default platform - throw new StingException( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + - "No default platform specified. Users must set the default platform using the --default_platform argument." ); - } - } - - // Differentiate between first and second of pair. - // The sequencing machine cycle keeps incrementing for the second read in a pair. 
So it is possible for a read group - // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. - // Therefore the cycle covariate must differentiate between first and second of pair reads. - // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because - // the current sequential model would consider the effects independently instead of jointly. - if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { - cycle *= -1; - } - - return cycle; - } - */ - - // todo -- this should be put into a common place in the code base - private static List PACBIO_NAMES = Arrays.asList("PACBIO"); - private static List ILLUMINA_NAMES = Arrays.asList("ILLUMINA", "SLX", "SOLEXA"); - private static List SOLID_NAMES = Arrays.asList("SOLID"); - private static List LS454_NAMES = Arrays.asList("454"); - - private static boolean isPlatform(SAMRecord read, List names) { - String pl = read.getReadGroup().getPlatform().toUpperCase(); - for ( String name : names ) - if ( pl.contains( name ) ) - return true; - return false; - } - - // Used to pick out the covariate's value from attributes of the read - public void getValues(SAMRecord read, Comparable[] comparable) { - - //----------------------------- - // ILLUMINA and SOLID - //----------------------------- - - - if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES)) { - final int init; - final int increment; - if( !read.getReadNegativeStrandFlag() ) { - // Differentiate between first and second of pair. - // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group - // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. - // Therefore the cycle covariate must differentiate between first and second of pair reads. 
- // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because - // the current sequential model would consider the effects independently instead of jointly. - if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { - //second of pair, positive strand - init = -1; - increment = -1; - } - else - { - //first of pair, positive strand - init = 1; - increment = 1; - } - - } else { - if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { - //second of pair, negative strand - init = -read.getReadLength(); - increment = 1; - } - else - { - //first of pair, negative strand - init = read.getReadLength(); - increment = -1; - } - } - - int cycle = init; - for(int i = 0; i < read.getReadLength(); i++) { - comparable[i] = cycle; - cycle += increment; - } - } - else if ( isPlatform(read, LS454_NAMES) ) { // Some bams have "LS454" and others have just "454" - - final int readLength = read.getReadLength(); - final byte[] bases = read.getReadBases(); - - // Differentiate between first and second of pair. - // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group - // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. - // Therefore the cycle covariate must differentiate between first and second of pair reads. - // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because - // the current sequential model would consider the effects independently instead of jointly. - final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); - - int cycle = multiplyByNegative1 ? 
-1 : 1; - - // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change - // For example, AAAAAAA was probably read in two flow cycles but here we count it as one - if( !read.getReadNegativeStrandFlag() ) { // Forward direction - int iii = 0; - while( iii < readLength ) - { - while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; } - while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; } - while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; } - while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; } - if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; } - if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; } - - } - } else { // Negative direction - int iii = readLength-1; - while( iii >= 0 ) - { - while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; } - while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; } - while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; } - while( iii >= 0 && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; } - if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; } - if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; } - } - } - } - else { - throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform()); - } - - - } - - // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Dinuc.java 
b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Dinuc.java deleted file mode 100755 index 6af404154..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Dinuc.java +++ /dev/null @@ -1,72 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. 
- * User: rpoplin - * Date: Nov 16, 2009 - */ -//public class Dinuc implements Comparable{ -public class Dinuc implements Comparable{ - private byte first; - private byte second; - - public Dinuc() { - first = 0; - second = 0; - } - - public Dinuc(final byte _first, final byte _second) { - first = _first; - second = _second; - } - - public final void setValues(final byte _first, final byte _second) { - first = _first; - second = _second; - } - - public int compareTo(final Dinuc that) { - if( this.first > that.first ) { return 1; } - else if( this.first < that.first ) { return -1; } - else { //this.first equals that.first - if( this.second > that.second ) { return 1; } - else if( this.second < that.second ) { return -1; } - else { return 0; } - } - - } - - public static int hashBytes(final byte byte1, final byte byte2) { - return byte1 << 8 + byte2; - } - - public String toString() { // This method call is how the Dinuc will get written out to the table recalibration file - byte[] byteArray = {first,second}; - return new String(byteArray); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/HomopolymerCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/HomopolymerCovariate.java deleted file mode 100755 index 3b821b0e1..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/HomopolymerCovariate.java +++ /dev/null @@ -1,141 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import net.sf.samtools.SAMRecord; -//g import org.broadinstitute.sting.gatk.walkers.recalibration.ExperimentalCovariate; -//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * 
files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Dec 4, 2009 - * - * The Homopolymer Run Covariate. This is the number of consecutive bases in the previous N that match the current base. 
- * For example, if N = 10: - * ATTGCCCCGTAAAAAAAAATA - * 001001230001234567800 - */ - -public class HomopolymerCovariate implements ExperimentalCovariate { - - int numBack = 7; - - // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - numBack = RAC.HOMOPOLYMER_NBACK; - } - - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - - // This block of code is for if you don't want to only count consecutive bases - // ATTGCCCCGTAAAAAAAAATA - // 001001231211234567819 - /* - int numAgree = 0; // The number of bases that agree with you in the previous numBack bases of the read - int startPos = 0; - int stopPos = 0; - byte[] bases = read.getReadBases(); - byte thisBase = bases[offset]; - if( !read.getReadNegativeStrandFlag() ) { // Forward direction - startPos = Math.max(offset - numBack, 0); - stopPos = Math.max(offset - 1, 0); - } else { // Negative direction - startPos = Math.min(offset + 2, bases.length); - stopPos = Math.min(offset + numBack + 1, bases.length); - } - - for( int iii = startPos; iii < stopPos; iii++ ) { - if( bases[iii] == thisBase ) { numAgree++; } - } - */ - - int numAgree = 0; // The number of consecutive bases that agree with you in the previous numBack bases of the read - final byte[] bases = read.getReadBases(); - int iii = offset; - if( !read.getReadNegativeStrandFlag() ) { // Forward direction - while( iii <= bases.length-2 && bases[iii] == bases[iii+1] && numAgree < numBack ) { - numAgree++; - iii++; - } - } else { // Negative direction - while( iii >= 1 && bases[iii] == bases[iii-1] && numAgree < numBack ) { - numAgree++; - iii--; - } - } - - return numAgree; - } - - private void getContextHomopolymerLength(final byte[] refBytes, Comparable[] hrunArray) { - // compute forward hrun length, example: - // AGGTGACCCCCCTGAGAG - // 
001000012345000000 - int runCount = 0; - hrunArray[0] = 0; - int[] hforward = new int[hrunArray.length]; - int[] hreverse = new int[hrunArray.length]; - - for (int i = 1; i < refBytes.length; i++) { - if (refBytes[i] == refBytes[i-1]) - hforward[i] = hforward[i-1]+1; - else - hforward[i] = 0; - } - - // do similar thing for reverse length, example: - // AGGTGACCCCCCTGAGAG - // 021000543210000000 - // and then accumulate with forward values. - // Total: - // AGGTGACCCCCCTGAGAG - // 022000555555000000 - for (int i=refBytes.length-1; i > 0; i--) { - if (refBytes[i-1] == refBytes[i]) - hreverse[i-1] += hreverse[i]+1; - } - - for (int i = 1; i < refBytes.length; i++) - hrunArray[i] = hforward[i]+hreverse[i]; - } - - public void getValues(SAMRecord read, Comparable[] comparable) { - - // getContextHomopolymerLength(read.getReadBases(), comparable); - for(int iii = 0; iii < read.getReadLength(); iii++) { - comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized - } - } - - // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); - } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariate.java deleted file mode 100755 index 4e7bc06e4..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariate.java +++ /dev/null @@ -1,113 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; -//g import org.broadinstitute.sting.gatk.walkers.recalibration.ExperimentalCovariate; -//g import 
org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.BaseUtils; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 3, 2009 - * - * The Indel Count covariate, derived from the insertion (I) and deletion (D) elements of the read's CIGAR. 
- */ - -public class IndelCountCovariate implements ExperimentalCovariate { - - // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - } - - - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read ) { - - Cigar c = read.getCigar(); - - int indelCount = 0; - - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - switch( ce.getOperator() ) { - case D: - indelCount++; - break; - case I: - indelCount++; - break; - case M: - case N: - case S: - default: - break; - } - } - - return indelCount; - } - - public void getValues(SAMRecord read, Comparable[] comparable) { -/* Comparable numIndels = getValue(read); - for(int iii = 0; iii < read.getReadLength(); iii++) { - comparable[iii] = numIndels; // BUGBUG: this can be optimized - } */ - int ind = 0; - Cigar c = read.getCigar(); - System.out.println(c.toString()); - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - - switch( ce.getOperator() ) { - case D: - case I: - for (int k=0; k < ce.getLength(); k++) - comparable[ind++] = ce.getLength(); - break; - case M: - case N: - case S: - for (int k=0; k < ce.getLength(); k++) - comparable[ind++] = 0; - default: - break; - } - } - } - - - // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); - } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariatesWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariatesWalker.java deleted file mode 100755 index a6fcce290..000000000 --- 
a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariatesWalker.java +++ /dev/null @@ -1,638 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import org.broad.tribble.bed.BEDCodec; -import org.broad.tribble.dbsnp.DbSNPCodec; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broadinstitute.sting.commandline.Gather; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -//import org.broadinstitute.sting.gatk.walkers.recalibration.*; -//import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer; -import org.broadinstitute.sting.gatk.walkers.recalibration.TableRecalibrationWalker; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.NestedHashMap; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -/** - * This walker is designed to work as the first pass in a two-pass processing step. 
- * It does a by-locus traversal operating only at sites that are not in dbSNP. - * We assume that all reference mismatches we see are therefore errors and indicative of poor base quality. - * This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide) - * Since there is a large amount of data one can then calculate an empirical probability of error - * given the particular covariates seen at this site, where p(error) = num mismatches / num observations - * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score) - * The first non-comment line of the output file gives the name of the covariates that were used for this calculation. - * - * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified - * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. - * - * @author rpoplin - * @since Nov 3, 2009 - * @help.summary First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide). 
- */ - -@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) -@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file -@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality -@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta -@PartitionBy(PartitionType.LOCUS) -// todo - merge with CountCovariates, all work is done, just need to port over -public class IndelCountCovariatesWalker extends LocusWalker implements TreeReducible { - - ///////////////////////////// - // Constants - ///////////////////////////// - private static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; //used to label GATKSAMRecords that should be skipped. - private static final String SEEN_ATTRIBUTE = "SEEN"; //used to label GATKSAMRecords as processed. - private static final String COVARS_ATTRIBUTE = "COVARS"; //used to store covariates array as a temporary attribute inside GATKSAMRecord. - - ///////////////////////////// - // Shared Arguments - ///////////////////////////// - @ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - - @Output - PrintStream out; - - ///////////////////////////// - // Command Line Arguments - ///////////////////////////// - @Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the outputted covariates table recalibration file") - @Gather(CountCovariatesGatherer.class) - public PrintStream RECAL_FILE; - - @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) - private boolean LIST_ONLY = false; - @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. 
ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false) - private String[] COVARIATES = null; - @Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false) - private boolean USE_STANDARD_COVARIATES = false; - - @Argument(fullName="count_indels", shortName="indels", doc="Count covariates at indel sites", required=false) - private boolean COUNT_INDELS = false; - - ///////////////////////////// - // Debugging-only Arguments - ///////////////////////////// - @Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.") - private boolean DONT_SORT_OUTPUT = false; - @Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. 
Very unsafe and for expert users only.") - private boolean RUN_WITHOUT_DBSNP = false; - - ///////////////////////////// - // Private Member Variables - ///////////////////////////// - private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps - private final ArrayList requestedCovariates = new ArrayList(); // A list to hold the covariate objects that were requested - private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) - private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen) - - public static class CountedData { - private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file - private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file - private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file - private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base - private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. 
- - private long dbSNPCountsMM = 0, dbSNPCountsBases = 0; // mismatch/base counts for dbSNP loci - private long novelCountsMM = 0, novelCountsBases = 0; // mismatch/base counts for non-dbSNP loci - private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation - - /** - * Adds the values of other to this, returning this - * @param other - * @return this object - */ - public CountedData add(CountedData other) { - countedSites += other.countedSites; - countedBases += other.countedBases; - skippedSites += other.skippedSites; - solidInsertedReferenceBases += other.solidInsertedReferenceBases; - otherColorSpaceInconsistency += other.otherColorSpaceInconsistency; - dbSNPCountsMM += other.dbSNPCountsMM; - dbSNPCountsBases += other.dbSNPCountsBases; - novelCountsMM += other.novelCountsMM; - novelCountsBases += other.novelCountsBases; - lociSinceLastDbsnpCheck += other.lociSinceLastDbsnpCheck; - return this; - } - } - - // enable deletions in the pileup - public boolean includeReadsWithDeletionAtLoci() { return COUNT_INDELS; } - - // enable extended events for indels - public boolean generateExtendedEvents() { return COUNT_INDELS; } - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * Parse the -cov arguments and create a list of covariates to be used here - * Based on the covariates' estimates for initial capacity allocate the data hashmap - */ - public void initialize() { - - if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; } - if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; } - - // Get a list of all available covariates - final List> covariateClasses = new PluginManager( Covariate.class ).getPlugins(); - final List> requiredClasses = new PluginManager( RequiredCovariate.class 
).getPlugins(); - final List> standardClasses = new PluginManager( StandardCovariate.class ).getPlugins(); - - // Print and exit if that's what was requested - if ( LIST_ONLY ) { - out.println( "Available covariates:" ); - for( Class covClass : covariateClasses ) { - out.println( covClass.getSimpleName() ); - } - out.println(); - - System.exit( 0 ); // Early exit here because user requested it - } - - // Warn the user if no dbSNP file or other variant mask was specified - boolean foundDBSNP = false; - for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) { - if( rod != null ) { - if( rod.getType().equals(DbSNPCodec.class) || - rod.getType().equals(VCFCodec.class) || - rod.getType().equals(BEDCodec.class) ) { - foundDBSNP = true; - break; - } - } - } - if( !foundDBSNP && !RUN_WITHOUT_DBSNP ) { - throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation."); - } - - // Initialize the requested covariates by parsing the -cov argument - // First add the required covariates - if( requiredClasses.size() == 2) { // readGroup and reported quality score - requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here - requestedCovariates.add( new QualityScoreCovariate() ); - } else { - throw new UserException.CommandLineException("There are more required covariates than expected. 
The instantiation list needs to be updated with the new required covariate and in the correct order."); - } - // Next add the standard covariates if -standard was specified by the user - if( USE_STANDARD_COVARIATES ) { - // We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order - // A list of Classes can't be sorted, but a list of Class names can be - final List standardClassNames = new ArrayList(); - for( Class covClass : standardClasses ) { - standardClassNames.add( covClass.getName() ); - } - Collections.sort(standardClassNames); // Sort the list of class names - for( String className : standardClassNames ) { - for( Class covClass : standardClasses ) { // Find the class that matches this class name - if( covClass.getName().equals( className ) ) { - try { - final Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - } - } - } - // Finally parse the -cov arguments that were provided, skipping over the ones already specified - if( COVARIATES != null ) { - for( String requestedCovariateString : COVARIATES ) { - boolean foundClass = false; - for( Class covClass : covariateClasses ) { - if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class - foundClass = true; - if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) { - try { - // Now that we've found a matching class, try to instantiate it - final Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - } - } - - if( !foundClass ) { - throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") 
isn't a valid covariate option. Use --list to see possible covariates." ); - } - } - } - - logger.info( "The covariates being used here: " ); - for( Covariate cov : requestedCovariates ) { - logger.info( "\t" + cov.getClass().getSimpleName() ); - cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection - } - -// try { -// stream = new PrintStream( RAC.RECAL_FILE ); -// } catch ( FileNotFoundException e ) { -// throw new RuntimeException( "Couldn't open output file: ", e ); -// } - } - - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * For each read at this locus get the various covariate values and increment that location in the map based on - * whether or not the base matches the reference at this particular location - * @param tracker The reference metadata tracker - * @param ref The reference context - * @param context The alignment context - * @return Returns 1, but this value isn't used in the reduce step - */ - public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - - // Pull out data for this locus for all the input RODs and check if this is a known variant site in any of them - boolean isKnownVariant = false; - for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) { - if( vc != null ) { - isKnownVariant = true; - break; - } - } - - // Only use data from non-dbsnp sites - // Assume every mismatch at a non-dbsnp site is indicative of poor quality - CountedData counter = new CountedData(); - if( !isKnownVariant ) { - - if (COUNT_INDELS && context.hasExtendedEventPileup()) - { - - for ( ExtendedEventPileupElement p : context.getExtendedEventPileup().toExtendedIterable() ) { - - GATKSAMRecord gatkRead 
= (GATKSAMRecord) p.getRead(); - parsePileupElement(gatkRead, p.getOffset(), ref, counter, RAC, p.getType(), true); - - } - } - else { - // For each read at this locus - for( PileupElement p : context.getBasePileup() ) { - GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead(); - parsePileupElement(gatkRead, p.getOffset(), ref, counter, RAC, ExtendedEventPileupElement.Type.NOEVENT, false); - } - } - counter.countedSites++; - } else { // We skipped over the dbSNP site, and we are only processing every Nth locus - counter.skippedSites++; - updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable - } - - return counter; - } - - private void parsePileupElement(GATKSAMRecord gatkRead, int offset, ReferenceContext ref, CountedData counter, - RecalibrationArgumentCollection RAC, ExtendedEventPileupElement.Type type, boolean inExtendedPileup) { - - - if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) { - return; - } - - if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) ) - { - gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true ); - RecalDataManager.parseSAMRecord( gatkRead, RAC ); - - // Skip over reads with no calls in the color space if the user requested it - if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) { - gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true); - return; - } - - RecalDataManager.parseColorSpace( gatkRead ); - gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE, - RecalDataManager.computeCovariates( gatkRead, requestedCovariates )); - } - - // Skip this position if base quality is zero - boolean hasIndelAtThisPosition = false; - if (inExtendedPileup) { - // only count indel events in extended pileups - if (type.equals(ExtendedEventPileupElement.Type.NOEVENT)) - return; - - else - hasIndelAtThisPosition = true; - } else { - // in regular 
pileups we still get del bases: ignore now - if (offset < 0) - return; - } - - - if (COUNT_INDELS) { - if (offset < 0) { - // recompute offset in case of a deletion - offset = ref.getLocus().getStart() - gatkRead.getUnclippedStart(); - // further edge case: read starting w/insertion: ignore for now - if (offset < 0) return; - } - updateDataFromRead( counter, gatkRead, offset, ref.getBase(), hasIndelAtThisPosition ); - - } - else { - // Skip this position if base quality is zero - if( gatkRead.getBaseQualities()[offset] > 0 ) { - - byte[] bases = gatkRead.getReadBases(); - byte refBase = ref.getBase(); - - // Skip if this base is an 'N' or etc. - if( BaseUtils.isRegularBase( bases[offset] ) ) { - - // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || - !RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) { - - // This base finally passed all the checks for a good base, so add it to the big data hashmap - updateDataFromRead( counter, gatkRead, offset, refBase, hasIndelAtThisPosition ); - - } else { // calculate SOLID reference insertion rate - if( refBase == bases[offset] ) { - counter.solidInsertedReferenceBases++; - } else { - counter.otherColorSpaceInconsistency++; - } - } - } - } - } - } - - - /** - * Update the mismatch / total_base counts for a given class of loci. 
- * - * @param counter The CountedData to be updated - * @param context The AlignmentContext which holds the reads covered by this locus - * @param refBase The reference base - */ - private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) { - - if (context.hasExtendedEventPileup()){ - for( PileupElement p : context.getExtendedEventPileup() ) { - counter.novelCountsBases++; - } - return; - - } - - for( PileupElement p : context.getBasePileup() ) { - final byte readBase = p.getBase(); - final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase); - final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase); - - if( readBaseIndex != -1 && refBaseIndex != -1 ) { - if( readBaseIndex != refBaseIndex ) { - counter.novelCountsMM++; - } - counter.novelCountsBases++; - } - } - } - - /** - * Major workhorse routine for this walker. - * Loop through the list of requested covariates and pick out the value from the read, offset, and reference - * Using the list of covariate values as a key, pick out the RecalDatum and increment, - * adding one to the number of observations and potentially one to the number of mismatches - * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls - * because pulling things out of the SAMRecord is an expensive operation. 
- * @param counter Data structure which holds the counted bases - * @param gatkRead The SAMRecord holding all the data for this read - * @param offset The offset in the read for this locus - * @param refBase The reference base at this locus - */ - private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase, - final boolean hasIndelAtThisPosition) { - final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE); - - if (offset < 0) { - int k=0; - } - final Object[] key = covars[offset]; - - // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap - final NestedHashMap data = dataManager.data; //optimization - create local reference - RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key ); - if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it - // initialized with zeros, will be incremented at end of method - datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key ); - } - - // Need the bases to determine whether or not we have a mismatch - final byte base = gatkRead.getReadBases()[offset]; - final long curMismatches = datum.getNumMismatches(); - - // Add one to the number of observations and potentially one to the number of mismatches - if (COUNT_INDELS) - datum.incrementBaseCounts(hasIndelAtThisPosition); - else - datum.incrementBaseCounts( base, refBase ); - - counter.countedBases++; - counter.novelCountsBases++; - counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable - } - - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * Initialize the reduce step 
by creating a PrintStream from the filename specified as an argument to the walker. - * @return returns A PrintStream created from the -recalFile filename argument specified to the walker - */ - public CountedData reduceInit() { - return new CountedData(); - } - - /** - * The Reduce method doesn't do anything for this walker. - * @param mapped Result of the map. This value is immediately ignored. - * @param sum The summing CountedData used to output the CSV data - * @return returns The sum used to output the CSV data - */ - public CountedData reduce( CountedData mapped, CountedData sum ) { - // Do a dbSNP sanity check every so often - return validatingDbsnpMismatchRate(sum.add(mapped)); - } - - /** - * Validate the dbSNP reference mismatch rates. - */ - private CountedData validatingDbsnpMismatchRate(CountedData counter) { - if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) { - counter.lociSinceLastDbsnpCheck = 0; - - if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) { - final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases; - final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases; - - if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { - Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. 
" + - String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) ); - DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file - } - } - } - - return counter; - } - - public CountedData treeReduce( CountedData sum1, CountedData sum2 ) { - return validatingDbsnpMismatchRate(sum1.add(sum2)); - } - - /** - * Write out the full data hashmap to disk in CSV format - * @param sum The CountedData to write out to RECAL_FILE - */ - public void onTraversalDone( CountedData sum ) { - logger.info( "Writing raw recalibration data..." ); - outputToCSV( sum, RECAL_FILE ); - logger.info( "...done!" ); - } - - /** - * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format - * @param recalTableStream The PrintStream to write out to - */ - private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) { - recalTableStream.printf("# Counted Sites %d%n", sum.countedSites); - recalTableStream.printf("# Counted Bases %d%n", sum.countedBases); - recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites); - recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites); - - if( sum.solidInsertedReferenceBases != 0 ) { - recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases); - recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency); - } - - // Output header saying which covariates were used and in what order - for( Covariate cov : requestedCovariates ) { - recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," ); - } - - if (COUNT_INDELS) - recalTableStream.println("nObservations,nTrueIndels,Qempirical"); - else - 
recalTableStream.println("nObservations,nMismatches,Qempirical"); - - if( DONT_SORT_OUTPUT ) { - printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); - } else { - printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); - } - - // print out an EOF marker - recalTableStream.println(TableRecalibrationWalker.EOF_MARKER); - } - - private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { - final ArrayList keyList = new ArrayList(); - for( Object comp : data.keySet() ) { - keyList.add((Comparable) comp); - } - - Collections.sort(keyList); - - for( Comparable comp : keyList ) { - key[curPos] = comp; - final Object val = data.get(comp); - if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps - // For each Covariate in the key - for( Object compToPrint : key ) { - // Output the Covariate's value - recalTableStream.print( compToPrint + "," ); - } - // Output the RecalDatum entry - recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); - } else { // Another layer in the nested hash map - printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val ); - } - } - } - - private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { - for( Object comp : data.keySet() ) { - key[curPos] = comp; - final Object val = data.get(comp); - if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps - // For each Covariate in the key - for( Object compToPrint : key ) { - // Output the Covariate's value - recalTableStream.print( compToPrint + "," ); - } - // Output the RecalDatum entry - recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); - } else { // Another layer in the nested hash map - printMappings( recalTableStream, curPos + 1, key, (Map) val ); - } - } - } -} - diff 
--git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelPositionCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelPositionCovariate.java deleted file mode 100755 index 571ff88e2..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelPositionCovariate.java +++ /dev/null @@ -1,39 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import net.sf.samtools.SAMRecord; -//g import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; -//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: delangel - * Date: Jan 17, 2011 - * Time: 2:53:28 PM - * To change this template use File | Settings | File Templates. - */ -public class IndelPositionCovariate implements Covariate { - - // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - } - - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - int cycle = offset; - if( read.getReadNegativeStrandFlag() ) { - cycle = read.getReadLength() - (offset + 1); - } - return cycle; - } - - // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); - } - - public void getValues(SAMRecord read, Comparable[] comparable) { - for(int iii = 0; iii < read.getReadLength(); iii++) { - comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/QualityScoreCovariate.java 
b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/QualityScoreCovariate.java deleted file mode 100755 index a2b479d40..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/QualityScoreCovariate.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import net.sf.samtools.SAMRecord; -//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 3, 2009 - * - * The Reported Quality Score covariate. 
- */ - -public class QualityScoreCovariate implements RequiredCovariate { - - // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - } - - /* - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - return (int)(read.getBaseQualities()[offset]); - } - */ - - public void getValues(SAMRecord read, Comparable[] comparable) { - byte[] baseQualities = read.getBaseQualities(); - for(int i = 0; i < read.getReadLength(); i++) { -// comparable[i] = (int) baseQualities[i]; - comparable[i] = (int) 45; - } - } - - // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); - } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/ReadGroupCovariate.java deleted file mode 100755 index f250523cb..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/ReadGroupCovariate.java +++ /dev/null @@ -1,67 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import net.sf.samtools.SAMRecord; -//import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * 
Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Oct 30, 2009 - * - * The Read Group covariate. - */ - -public class ReadGroupCovariate implements RequiredCovariate { - - public static final String defaultReadGroup = "DefaultReadGroup"; - - // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - } - - /* - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - return read.getReadGroup().getReadGroupId(); - } - */ - - public void getValues(SAMRecord read, Comparable[] comparable) { - final String readGroupId = read.getReadGroup().getReadGroupId(); - for(int i = 0; i < read.getReadLength(); i++) { - comparable[i] = readGroupId; - } - } - - // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return str; - } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDataManager.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDataManager.java deleted 
file mode 100755 index 6917089cc..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDataManager.java +++ /dev/null @@ -1,649 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.NestedHashMap; - -import java.util.*; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMUtils; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 6, 2009 - * - * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. - * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. - * This class holds the parsing methods that are shared between CountCovariates and TableRecalibration. 
- */ - -public class RecalDataManager { - - public final NestedHashMap data; // The full dataset - private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed - private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed - private final ArrayList dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed - - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores - public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams - public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams - public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color - private static boolean warnUserNullReadGroup = false; - private static boolean warnUserNullPlatform = false; - - public enum SOLID_RECAL_MODE { - DO_NOTHING, - SET_Q_ZERO, - SET_Q_ZERO_BASE_N, - REMOVE_REF_BIAS - } - - public enum SOLID_NOCALL_STRATEGY { - THROW_EXCEPTION, - LEAVE_READ_UNRECALIBRATED, - PURGE_READ - } - - public RecalDataManager() { - data = new NestedHashMap(); - dataCollapsedReadGroup = null; - dataCollapsedQualityScore = null; - dataCollapsedByCovariate = null; - } - - public RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) { - if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker - data = null; - dataCollapsedReadGroup = new NestedHashMap(); - dataCollapsedQualityScore = new NestedHashMap(); - dataCollapsedByCovariate = new ArrayList(); - for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and 
QualityScore aren't counted here, their tables are separate - dataCollapsedByCovariate.add( new NestedHashMap() ); - } - } else { - data = new NestedHashMap(); - dataCollapsedReadGroup = null; - dataCollapsedQualityScore = null; - dataCollapsedByCovariate = null; - } - } - - /** - * Add the given mapping to all of the collapsed hash tables - * @param key The list of comparables that is the key for this mapping - * @param fullDatum The RecalDatum which is the data for this mapping - * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table - */ - public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN ) { - - // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around - //data.put(key, thisDatum); // add the mapping to the main table - - final int qualityScore = Integer.parseInt( key[1].toString() ); - final Object[] readGroupCollapsedKey = new Object[1]; - final Object[] qualityScoreCollapsedKey = new Object[2]; - final Object[] covariateCollapsedKey = new Object[3]; - RecalDatum collapsedDatum; - - // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed - if( qualityScore >= PRESERVE_QSCORES_LESS_THAN ) { - readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group - collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get( readGroupCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey ); - } else { - collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported - } - } - - // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed - qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ... 
- qualityScoreCollapsedKey[1] = key[1]; // and quality score - collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get( qualityScoreCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedQualityScore.put( new RecalDatum(fullDatum), qualityScoreCollapsedKey ); - } else { - collapsedDatum.increment( fullDatum ); - } - - // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { - covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ... - covariateCollapsedKey[1] = key[1]; // and quality score ... - final Object theCovariateElement = key[iii + 2]; // and the given covariate - if( theCovariateElement != null ) { - covariateCollapsedKey[2] = theCovariateElement; - collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey ); - } else { - collapsedDatum.increment( fullDatum ); - } - } - } - } - - /** - * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score - * that will be used in the sequential calculation in TableRecalibrationWalker - * @param smoothing The smoothing parameter that goes into empirical quality score calculation - * @param maxQual At which value to cap the quality scores - */ - public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) { - - recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual); - recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual); - for( NestedHashMap map : dataCollapsedByCovariate ) { - recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); - checkForSingletons(map.data); - } - } - - private void recursivelyGenerateEmpiricalQualities( final 
Map data, final int smoothing, final int maxQual ) { - - for( Object comp : data.keySet() ) { - final Object val = data.get(comp); - if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps - ((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing, maxQual); - } else { // Another layer in the nested hash map - recursivelyGenerateEmpiricalQualities( (Map) val, smoothing, maxQual); - } - } - } - - private void checkForSingletons( final Map data ) { - // todo -- this looks like it's better just as a data.valueSet() call? - for( Object comp : data.keySet() ) { - final Object val = data.get(comp); - if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps - if( data.keySet().size() == 1) { - data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ... - // in a previous step of the sequential calculation model - } - } else { // Another layer in the nested hash map - checkForSingletons( (Map) val ); - } - } - } - - /** - * Get the appropriate collapsed table out of the set of all the tables held by this Object - * @param covariate Which covariate indexes the desired collapsed HashMap - * @return The desired collapsed HashMap - */ - public final NestedHashMap getCollapsedTable( final int covariate ) { - if( covariate == 0) { - return dataCollapsedReadGroup; // Table where everything except read group has been collapsed - } else if( covariate == 1 ) { - return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed - } else { - return dataCollapsedByCovariate.get( covariate - 2 ); // Table where everything except read group, quality score, and given covariate has been collapsed - } - } - - /** - * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string - * @param read The read to adjust - * 
@param RAC The list of shared command line arguments - */ - public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { - - SAMReadGroupRecord readGroup = read.getReadGroup(); - - // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments - if( readGroup == null ) { - if( RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) { - if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) { - Utils.warnUser("The input .bam file contains reads with no read group. " + - "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName() ); - warnUserNullReadGroup = true; - } - // There is no readGroup so defaulting to these values - readGroup = new SAMReadGroupRecord( RAC.DEFAULT_READ_GROUP ); - readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); - ((GATKSAMRecord)read).setReadGroup( readGroup ); - } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() + - " Users must set both the default read group using the --default_read_group argument and the default platform using the --default_platform argument." 
); - } - } - - if( RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user - final String oldPlatform = readGroup.getPlatform(); - readGroup = new SAMReadGroupRecord( RAC.FORCE_READ_GROUP ); - readGroup.setPlatform( oldPlatform ); - ((GATKSAMRecord)read).setReadGroup( readGroup ); - } - - if( RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { - readGroup.setPlatform( RAC.FORCE_PLATFORM ); - } - - if ( readGroup.getPlatform() == null ) { - if( RAC.DEFAULT_PLATFORM != null ) { - if( !warnUserNullPlatform ) { - Utils.warnUser("The input .bam file contains reads with no platform information. " + - "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName() ); - warnUserNullPlatform = true; - } - readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); - } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() + - " Users must set the default platform using the --default_platform argument." ); - } - } - } - - /** - * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space - * @param read The SAMRecord to parse - */ - public static void parseColorSpace( final SAMRecord read ) { - - // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base - if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) { - if( read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read - final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if( attr != null ) { - byte[] colorSpace; - if( attr instanceof String ) { - colorSpace = ((String)attr).getBytes(); - } else { - throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read - byte[] readBases = read.getReadBases(); - if( read.getReadNegativeStrandFlag() ) { - readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); - } - final byte[] inconsistency = new byte[readBases.length]; - int iii; - byte prevBase = colorSpace[0]; // The sentinel - for( iii = 0; iii < readBases.length; iii++ ) { - final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] ); - inconsistency[iii] = (byte)( thisBase == readBases[iii] ? 0 : 1 ); - prevBase = readBases[iii]; - } - read.setAttribute( org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency ); - - } else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - } - } - } - - /** - * Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases - * This method doesn't add the inconsistent tag to the read like parseColorSpace does - * @param read The SAMRecord to parse - * @param originalQualScores The array of original quality scores to modify during the correction - * @param solidRecalMode Which mode of solid recalibration to apply - * @param refBases The reference for this read - * @return A new array of quality scores that have been ref bias corrected - */ - public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases ) { - - final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if( attr != null ) { - byte[] colorSpace; - if( attr instanceof String ) { - colorSpace = ((String)attr).getBytes(); - } else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read - byte[] readBases = read.getReadBases(); - final byte[] colorImpliedBases = readBases.clone(); - byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray( read.getCigar(), read.getReadBases(), refBases ); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases - if( read.getReadNegativeStrandFlag() ) { - readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); - refBasesDirRead = BaseUtils.simpleReverseComplement( 
refBasesDirRead.clone() ); - } - final int[] inconsistency = new int[readBases.length]; - byte prevBase = colorSpace[0]; // The sentinel - for( int iii = 0; iii < readBases.length; iii++ ) { - final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] ); - colorImpliedBases[iii] = thisBase; - inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 ); - prevBase = readBases[iii]; - } - - // Now that we have the inconsistency array apply the desired correction to the inconsistent bases - if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO ) { // Set inconsistent bases and the one before it to Q0 - final boolean setBaseN = false; - originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } else if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N ) { - final boolean setBaseN = true; - originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } else if( solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases - solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead); - } - - } else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - - return originalQualScores; - } - - public static boolean checkNoCallColorSpace( final SAMRecord read ) { - if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) { - final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if( attr != null ) { - byte[] colorSpace; - if( attr instanceof String ) { - colorSpace = ((String)attr).substring(1).getBytes(); // trim off the Sentinel - } else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - for( byte color : colorSpace ) { - if( color != (byte)'0' && color != (byte)'1' && color != (byte)'2' && color != (byte)'3' ) { - return true; // There is a bad color in this SOLiD read and the user wants to skip over it - } - } - - } else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - } - - return false; // There aren't any color no calls in this SOLiD read - } - - /** - * Perform the SET_Q_ZERO solid recalibration. 
Inconsistent color space bases and their previous base are set to quality zero - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color - * @param originalQualScores The array of original quality scores to set to zero if needed - * @param refBases The reference which has been RC'd if necessary - * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar - * @return The byte array of original quality scores some of which might have been set to zero - */ - private static byte[] solidRecalSetToQZero( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, - final byte[] refBases, final boolean setBaseN ) { - - final boolean negStrand = read.getReadNegativeStrandFlag(); - for( int iii = 1; iii < originalQualScores.length; iii++ ) { - if( inconsistency[iii] == 1 ) { - if( readBases[iii] == refBases[iii] ) { - if( negStrand ) { originalQualScores[originalQualScores.length-(iii+1)] = (byte)0; } - else { originalQualScores[iii] = (byte)0; } - if( setBaseN ) { readBases[iii] = (byte)'N'; } - } - // Set the prev base to Q0 as well - if( readBases[iii-1] == refBases[iii-1] ) { - if( negStrand ) { originalQualScores[originalQualScores.length-iii] = (byte)0; } - else { originalQualScores[iii-1] = (byte)0; } - if( setBaseN ) { readBases[iii-1] = (byte)'N'; } - } - } - } - if( negStrand ) { - readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read - } - read.setReadBases( readBases ); - - return originalQualScores; - } - - /** - * Peform the REMOVE_REF_BIAS solid recalibration. 
Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color - * @param colorImpliedBases The bases implied by the color space, RC'd if necessary - * @param refBases The reference which has been RC'd if necessary - */ - private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, - final byte[] refBases) { - - final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG); - if( attr != null ) { - byte[] colorSpaceQuals; - if( attr instanceof String ) { - String x = (String)attr; - colorSpaceQuals = x.getBytes(); - SAMUtils.fastqToPhred(colorSpaceQuals); - } else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - - for( int iii = 1; iii < inconsistency.length - 1; iii++ ) { - if( inconsistency[iii] == 1 ) { - for( int jjj = iii - 1; jjj <= iii; jjj++ ) { // Correct this base and the one before it along the direction of the read - if( jjj == iii || inconsistency[jjj] == 0 ) { // Don't want to correct the previous base a second time if it was already corrected in the previous step - if( readBases[jjj] == refBases[jjj] ) { - if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( 2 ); - if( rand == 0 ) { // The color implied base won the coin flip - readBases[jjj] = colorImpliedBases[jjj]; - } - } else { - 
final int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); - final int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); - int diffInQuality = maxQuality - minQuality; - int numLow = minQuality; - if( numLow == 0 ) { - numLow++; - diffInQuality++; - } - final int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( numLow + numHigh ); - if( rand >= numLow ) { // higher q score won - if( maxQuality == (int)colorSpaceQuals[jjj] ) { - readBases[jjj] = colorImpliedBases[jjj]; - } // else ref color had higher q score, and won out, so nothing to do here - } else { // lower q score won - if( minQuality == (int)colorSpaceQuals[jjj] ) { - readBases[jjj] = colorImpliedBases[jjj]; - } // else ref color had lower q score, and won out, so nothing to do here - } - } - } - } - } - } - } - - if( read.getReadNegativeStrandFlag() ) { - readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read - } - read.setReadBases( readBases ); - } else { // No color space quality tag in file - throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName()); - } - } - - /** - * Given the base and the color calculate the next base in the sequence - * @param prevBase The base - * @param color The color - * @return The next base in the sequence - */ - private static byte getNextBaseFromColor( SAMRecord read, final byte prevBase, final byte color ) { - switch(color) { - case '0': - return prevBase; - case '1': - return performColorOne( prevBase ); - case '2': - return performColorTwo( prevBase ); - case '3': - return performColorThree( prevBase ); - default: - throw new UserException.MalformedBAM(read, 
"Unrecognized color space in SOLID read, color = " + (char)color + - " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); - } - } - - /** - * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality - * @param read The read which contains the color space to check against - * @param offset The offset in the read at which to check - * @return Returns true if the base was inconsistent with the color space - */ - public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) { - final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); - if( attr != null ) { - final byte[] inconsistency = (byte[])attr; - // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! - if( read.getReadNegativeStrandFlag() ) { // Negative direction - return inconsistency[inconsistency.length - offset - 1] != (byte)0; - } else { // Forward direction - return inconsistency[offset] != (byte)0; - } - - // This block of code is for if you want to check both the offset and the next base for color space inconsistency - //if( read.getReadNegativeStrandFlag() ) { // Negative direction - // if( offset == 0 ) { - // return inconsistency[0] != 0; - // } else { - // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0); - // } - //} else { // Forward direction - // if( offset == inconsistency.length - 1 ) { - // return inconsistency[inconsistency.length - 1] != 0; - // } else { - // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); - // } - //} - - } else { // No inconsistency array, so nothing is inconsistent - return false; - } - } - - /** - * Computes all requested covariates for every offset in the given read - * by 
calling covariate.getValues(..). - * - * @param gatkRead The read for which to compute covariate values. - * @param requestedCovariates The list of requested covariates. - * @return An array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. - */ - public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates) { - //compute all covariates for this read - final List requestedCovariatesRef = requestedCovariates; - final int numRequestedCovariates = requestedCovariatesRef.size(); - final int readLength = gatkRead.getReadLength(); - - final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; - final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; - - // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read - for( int i = 0; i < numRequestedCovariates; i++ ) { - requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder ); - for(int j = 0; j < readLength; j++) { - //copy values into a 2D array that allows all covar types to be extracted at once for - //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. - covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; - } - } - - return covariateValues_offset_x_covar; - } - - /** - * Perform a ceratin transversion (A <-> C or G <-> T) on the base. 
- * - * @param base the base [AaCcGgTt] - * @return the transversion of the base, or the input base if it's not one of the understood ones - */ - private static byte performColorOne(byte base) { - switch (base) { - case 'A': - case 'a': return 'C'; - case 'C': - case 'c': return 'A'; - case 'G': - case 'g': return 'T'; - case 'T': - case 't': return 'G'; - default: return base; - } - } - - /** - * Perform a transition (A <-> G or C <-> T) on the base. - * - * @param base the base [AaCcGgTt] - * @return the transition of the base, or the input base if it's not one of the understood ones - */ - private static byte performColorTwo(byte base) { - switch (base) { - case 'A': - case 'a': return 'G'; - case 'C': - case 'c': return 'T'; - case 'G': - case 'g': return 'A'; - case 'T': - case 't': return 'C'; - default: return base; - } - } - - /** - * Return the complement (A <-> T or C <-> G) of a base. - * - * @param base the base [AaCcGgTt] - * @return the complementary base, or the input base if it's not one of the understood ones - */ - private static byte performColorThree(byte base) { - switch (base) { - case 'A': - case 'a': return 'T'; - case 'C': - case 'c': return 'G'; - case 'G': - case 'g': return 'C'; - case 'T': - case 't': return 'A'; - default: return base; - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatum.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatum.java deleted file mode 100755 index acefc71dc..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatum.java +++ /dev/null @@ -1,112 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the 
"Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 3, 2009 - * - * An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. 
- */ - -public class RecalDatum extends RecalDatumOptimized { - - private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations - private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) - - //--------------------------------------------------------------------------------------------------------------- - // - // constructors - // - //--------------------------------------------------------------------------------------------------------------- - - public RecalDatum() { - numObservations = 0L; - numMismatches = 0L; - estimatedQReported = 0.0; - empiricalQuality = 0.0; - } - - public RecalDatum( final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality ) { - numObservations = _numObservations; - numMismatches = _numMismatches; - estimatedQReported = _estimatedQReported; - empiricalQuality = _empiricalQuality; - } - - public RecalDatum( final RecalDatum copy ) { - this.numObservations = copy.numObservations; - this.numMismatches = copy.numMismatches; - this.estimatedQReported = copy.estimatedQReported; - this.empiricalQuality = copy.empiricalQuality; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // increment methods - // - //--------------------------------------------------------------------------------------------------------------- - - public final void combine( final RecalDatum other ) { - final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); - this.increment( other.numObservations, other.numMismatches ); - this.estimatedQReported = -10 * Math.log10(sumErrors / (double)this.numObservations); - //if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = 
QualityUtils.MAX_REASONABLE_Q_SCORE; } - } - - //--------------------------------------------------------------------------------------------------------------- - // - // methods to derive empirical quality score - // - //--------------------------------------------------------------------------------------------------------------- - - public final void calcCombinedEmpiricalQuality( final int smoothing, final int maxQual ) { - this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again - } - - //--------------------------------------------------------------------------------------------------------------- - // - // misc. methods - // - //--------------------------------------------------------------------------------------------------------------- - - public final double getEstimatedQReported() { - return estimatedQReported; - } - - public final double getEmpiricalQuality() { - return empiricalQuality; - } - - private double calcExpectedErrors() { - return (double)this.numObservations * qualToErrorProb( estimatedQReported ); - } - - private double qualToErrorProb( final double qual ) { - return Math.pow(10.0, qual / -10.0); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatumOptimized.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatumOptimized.java deleted file mode 100755 index 2a91d02c4..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatumOptimized.java +++ /dev/null @@ -1,139 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; - -import java.util.List; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a 
copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Jan 6, 2010 - * - * An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed. - * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. 
- */ - -public class RecalDatumOptimized { - - protected long numObservations; // number of bases seen in total - protected long numMismatches; // number of bases seen that didn't match the reference - - //--------------------------------------------------------------------------------------------------------------- - // - // constructors - // - //--------------------------------------------------------------------------------------------------------------- - - public RecalDatumOptimized() { - numObservations = 0L; - numMismatches = 0L; - } - - public RecalDatumOptimized( final long _numObservations, final long _numMismatches) { - numObservations = _numObservations; - numMismatches = _numMismatches; - } - - public RecalDatumOptimized( final RecalDatumOptimized copy ) { - this.numObservations = copy.numObservations; - this.numMismatches = copy.numMismatches; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // increment methods - // - //--------------------------------------------------------------------------------------------------------------- - - public synchronized final void increment( final long incObservations, final long incMismatches ) { - numObservations += incObservations; - numMismatches += incMismatches; - } - - public synchronized final void increment( final RecalDatumOptimized other ) { - increment( other.numObservations, other.numMismatches ); - } - - public synchronized final void increment( final List data ) { - for ( RecalDatumOptimized other : data ) { - this.increment( other ); - } - } - - public synchronized final void incrementBaseCounts( final byte curBase, final byte refBase ) { - increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 
0 : 1 ); // increment takes num observations, then num mismatches - } - - public synchronized final void incrementBaseCounts(boolean hasIndelAtThisPosition) { - increment(1, hasIndelAtThisPosition ? 1 : 0); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // methods to derive empirical quality score - // - //--------------------------------------------------------------------------------------------------------------- - - public final double empiricalQualDouble( final int smoothing, final double maxQual ) { - final double doubleMismatches = (double) ( numMismatches + smoothing ); - final double doubleObservations = (double) ( numObservations + smoothing ); - double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - if (empiricalQual > maxQual) { empiricalQual = maxQual; } - return empiricalQual; - } - public final double empiricalQualDouble() { return empiricalQualDouble( 0, QualityUtils.MAX_REASONABLE_Q_SCORE ); } // 'default' behavior is to use smoothing value of zero - - public final byte empiricalQualByte( final int smoothing ) { - final double doubleMismatches = (double) ( numMismatches + smoothing ); - final double doubleObservations = (double) ( numObservations + smoothing ); - return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); // This is capped at Q40 - } - public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero - - //--------------------------------------------------------------------------------------------------------------- - // - // misc. 
methods - // - //--------------------------------------------------------------------------------------------------------------- - - public final long getNumObservations() { - return numObservations; - } - - public final long getNumMismatches() { - return numMismatches; - } - - public final String outputToCSV( ) { - return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() ); - } - public final String outputToCSV( final int smoothing ) { - return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte(smoothing) ); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalibrationArgumentCollection.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalibrationArgumentCollection.java deleted file mode 100755 index 996446b35..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalibrationArgumentCollection.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; - -import org.broadinstitute.sting.commandline.Argument; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 27, 2009 - * - * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker. - * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated. - */ - -public class RecalibrationArgumentCollection { - - ////////////////////////////////// - // Shared Command Line Arguments - ////////////////////////////////// - @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") - public String DEFAULT_READ_GROUP = null; - @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") - public String DEFAULT_PLATFORM = null; - @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") - public String FORCE_READ_GROUP = null; - @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. 
Valid options are illumina, 454, and solid.") - public String FORCE_PLATFORM = null; - @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) - public int WINDOW_SIZE = 5; - @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) - public int HOMOPOLYMER_NBACK = 7; - @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false) - public boolean EXCEPTION_IF_NO_TILE = false; - @Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") - public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO; - @Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. 
Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false) - public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelDBRateWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelDBRateWalker.java deleted file mode 100644 index 189db5206..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelDBRateWalker.java +++ /dev/null @@ -1,265 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.collections.ExpandingArrayList; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.io.PrintStream; -import java.util.*; - -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL 
AT chartl - * - * @Author chartl - * @Date Apr 21, 2010 - */ -@Reference(window=@Window(start=-40,stop=40)) -public class IndelDBRateWalker extends RodWalker { - @Output - PrintStream out; - @Argument(fullName="indelWindow",doc="size of the window in which to look for indels; max 40",required=false) - int indelWindow = 10; - @Argument(fullName="writeVCF",doc="Writes \"overlapping\" variants to this vcf",required=false) - PrintStream outVCF; - - VCFWriter vcfWriter; - - private List compContexts = new ArrayList(50); // not going to be more than 50 contexts in a size-40 window - private List evalContexts = new ArrayList(50); - - public void initialize() { - if ( indelWindow > 40 ) { - throw new UserException.CommandLineException("Indel windows have a maximum size of 40"); - } - - if ( outVCF != null ) { - vcfWriter = new StandardVCFWriter(outVCF); - Set header = new HashSet(); - header.addAll(VCFUtils.getHeaderFields(getToolkit())); - VCFHeader vcfHeader = new VCFHeader(header, SampleUtils.getUniqueSamplesFromRods(getToolkit())); - vcfWriter.writeHeader(vcfHeader); - } - } - - public OverlapTabulator reduceInit() { - return new OverlapTabulator(); - } - - public OverlapTabulator reduce(OverlapTable oTable, OverlapTabulator tabulator) { - if ( oTable != null ) { - tabulator.update(oTable); - } - - return tabulator; - } - - private void finalUpdate(OverlapTabulator tab) { - while ( ! 
evalContexts.isEmpty() ) { - tab.update(emptyOverlapTable(getToolkit().getGenomeLocParser())); - } - } - - public void onTraversalDone(OverlapTabulator tabulation) { - finalUpdate(tabulation); - out.printf("%s\t%s\t%s\t%s\t%s%n","Num_eval_sites","Num_comp_within_2bp","Num_comp_within_4bp","Num_comp_within_window","Total_comp_%"); - out.printf("%s",tabulation.tabulateString()); - } - - public OverlapTable map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - // step 1: get the eval and compare contexts - if ( tracker == null ) { - return null; - } - Object eval = tracker.getReferenceMetaData("eval") != null && - tracker.getReferenceMetaData("eval").size() != 0 ? tracker.getReferenceMetaData("eval").get(0) : null ; - VariantContext evalContext; - if ( eval != null ) { - evalContext = VariantContextAdaptors.toVariantContext("eval",eval,ref); - } else { - evalContext = null; - } - - Object comp = tracker.getReferenceMetaData("comp") != null && - tracker.getReferenceMetaData("comp").size() != 0 ? 
tracker.getReferenceMetaData("comp").get(0) : null; - VariantContext compContext; - if ( comp != null ) { - compContext = VariantContextAdaptors.toVariantContext("comp",comp, ref); - } else { - compContext = null; - } - // step 2: add indel contexts to the queue - addToQueue(compContexts,compContext); - addToQueue(evalContexts,evalContext); - // step 3: check to see if we have exceeded the window size for the top eval contexts of the queue - // and do the work - return getOverlapTable(ref); - } - - public void addToQueue(List queue, VariantContext con) { - if ( con != null && con.isIndel() ) { - queue.add(con); - } - } - - public OverlapTable getOverlapTable(ReferenceContext ref) { - // step 1: check that the eval queue is non-empty and that we are outside the window - if ( evalContexts.isEmpty() || VariantContextUtils.getLocation(ref.getGenomeLocParser(),evalContexts.get(0)).distance(ref.getLocus()) <= indelWindow ) { - return null; - } - // step 2: discard all comp variations which come before the window - while ( ! compContexts.isEmpty() && VariantContextUtils.getLocation(ref.getGenomeLocParser(),compContexts.get(0)).isBefore(ref.getLocus()) && - VariantContextUtils.getLocation(ref.getGenomeLocParser(),compContexts.get(0)).distance(ref.getLocus()) > indelWindow) { - compContexts.remove(0); - } - // step 3: see if there are any contexts left; if so then they must be within the window - if ( ! 
compContexts.isEmpty() ) { - return nonEmptyOverlapTable(ref); - } else { - return emptyOverlapTable(ref.getGenomeLocParser()); - } - } - - public OverlapTable emptyOverlapTable(GenomeLocParser genomeLocParser) { - // only eval, no comp - OverlapTable ot = new OverlapTable(genomeLocParser); - ot.setEvalSizeAndType(evalContexts.get(0)); - return ot; - } - - public OverlapTable nonEmptyOverlapTable(ReferenceContext ref) { - if ( vcfWriter != null ) { - int i = 0; - while ( i < compContexts.size() && VariantContextUtils.getLocation(ref.getGenomeLocParser(),compContexts.get(i)).isBefore(VariantContextUtils.getLocation(ref.getGenomeLocParser(),evalContexts.get(0)))) { - vcfWriter.add(compContexts.get(i),compContexts.get(i).getReference().getBases()[0]); - i++; - } - vcfWriter.add(evalContexts.get(0), ref.getBase()); - while ( i < compContexts.size() && VariantContextUtils.getLocation(ref.getGenomeLocParser(),compContexts.get(i)).distance(VariantContextUtils.getLocation(ref.getGenomeLocParser(),evalContexts.get(0))) <= indelWindow) { - vcfWriter.add(compContexts.get(i), compContexts.get(i).getReference().getBases()[0]); - i++; - } - } - OverlapTable ot = new OverlapTable(ref.getGenomeLocParser()); - ot.setCompOverlaps(compContexts.size()); - ot.setDistances(compContexts,evalContexts.get(0), indelWindow); - return ot; - } - - -} -class OverlapTable { - private GenomeLocParser genomeLocParser; - - private int numOverlaps; - private ExpandingArrayList distances; // currently unused - private int evalSize; - private boolean isDeletion; - - public OverlapTable(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - numOverlaps = 0; - } - - public void setEvalSizeAndType(VariantContext context) { - int size = context.getAlternateAllele(0).length(); - evalSize = size; - isDeletion = context.isDeletion(); - } - - public void setCompOverlaps(int overlaps) { - numOverlaps = overlaps; - } - - public void setDistances(List comps, VariantContext eval, int 
winsize) { - distances = new ExpandingArrayList(); - for ( VariantContext comp : comps ) { - if ( VariantContextUtils.getLocation(genomeLocParser,comp).distance(VariantContextUtils.getLocation(genomeLocParser,eval)) <= winsize ) { - distances.add(VariantContextUtils.getLocation(genomeLocParser,comp).distance(VariantContextUtils.getLocation(genomeLocParser,eval))); - } - } - } - - public int getNumOverlaps() { - return numOverlaps; - } - - public int getSize() { - return evalSize; - } - - public boolean isDeletion() { - return isDeletion; - } - - public ExpandingArrayList getDistances() { - return distances; - } -} - -class OverlapTabulator { - HashMap hitsToCounts; - int totalEvalVariants; - - public OverlapTabulator() { - hitsToCounts = new HashMap(); - totalEvalVariants = 0; - } - - public void update(OverlapTable table) { - totalEvalVariants++; - if ( table.getNumOverlaps() != 0 ) { - if ( hitsToCounts.containsKey(table.getNumOverlaps()) ) { - hitsToCounts.put(table.getNumOverlaps(),hitsToCounts.get(table.getNumOverlaps())+1); - } else { - hitsToCounts.put(table.getNumOverlaps(),1l); - } - } - } - - public String tabulateString() { - StringBuffer sb = new StringBuffer(); - sb.append(totalEvalVariants); - sb.append("\t"); - long lt2counts = 0l; - long lt4counts = 0l; - long totalCounts = 0l; - for ( int i = 0; i < 40; i ++ ) { - long counts = 0; - if ( hitsToCounts.containsKey(i) ) { - counts = hitsToCounts.get(i); - } - if ( i <= 2 ) { - lt2counts += counts; - } - if ( i <= 4 ) { - lt4counts += counts; - } - totalCounts += counts; - } - sb.append(lt2counts); - sb.append("\t"); - sb.append(lt4counts); - sb.append("\t"); - sb.append(totalCounts); - sb.append("\t"); - sb.append(String.format("%.2f", 100*( (double) totalCounts)/( (double) totalEvalVariants))); - return sb.toString(); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelErrorRateWalker.java 
b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelErrorRateWalker.java deleted file mode 100644 index 302c071d5..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelErrorRateWalker.java +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.collections.CircularArray; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.List; -import java.util.LinkedList; -import java.util.Iterator; -import java.io.PrintStream; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Jan 5, 2010 - * Time: 5:25:02 PM - * To change this template use File | Settings | File Templates. 
- */ -@Reference(window=@Window(start=-10,stop=10)) -public class IndelErrorRateWalker extends LocusWalker { - @Output - PrintStream out; - @Argument(fullName="minCoverage",shortName="minC",doc="Assess only sites with coverage at or above the specified value.",required=true) - int MIN_COVERAGE = 0; - @Argument(fullName="maxCoverage",shortName="maxC",doc="Assess only sites with coverage at or below the specified value.",required=false) - int MAX_COVERAGE = 1000000000; - @Argument(fullName="maxIndels",shortName="maxI",doc="Assess only sites with no more indels than the specified value.",required=false) - int MAX_INDELS = 1; - @Argument(fullName="minSeparation",shortName="minS", doc="Ignore reference sites within that many bases of a NON-countable indel sites ( > MAX_INDELS ).", - required=true) - int MIN_DISTANCE = 10; - private GenomeLoc skipToLoc = null; - private List countableIndelBuffer = new LinkedList(); - - private long totalObservationsMade = 0; // total number of observations at each reference base (regradless of the outcome); - // in other words, it is Sum_{all assessed ref positions R} coverage(R) - - private CircularArray.Int coverageBuffer = new CircularArray.Int(MIN_DISTANCE); - - private int MAX_LENGTH = 40; - private int[] delCounts = new int[MAX_LENGTH]; - private int[] insCounts = new int[MAX_LENGTH]; - - - @Override - public boolean generateExtendedEvents() { return true; } - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Override - public void initialize() { - - } - - private void countIndels(ReadBackedExtendedEventPileup p) { - for ( ExtendedEventPileupElement pe : p.toExtendedIterable() ) { - if ( ! 
pe.isIndel() ) continue; - if ( pe.getEventLength() > MAX_LENGTH ) continue; - if ( pe.isInsertion() ) insCounts[pe.getEventLength()-1]++; - else delCounts[pe.getEventLength()-1]++; - } - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - // if we got to ignore a stretch of reference bases (because we have seen a non-countable indel recently), do it now: - if ( skipToLoc != null ) { - if ( ref.getLocus().compareTo(skipToLoc) < 0 ) return 0; - else skipToLoc = null; // reached it, no need to do extra checks anymore - } - - // if we previously cached indels that looked like something we'd like to count: - if ( countableIndelBuffer.size() != 0 ) { - // when we are at least MIN_DISTANCE bases away, we are guaranteed that we are not going - // to run into a NON-countable indel anymore that is so close that it will render last countable indel useless. - - Iterator iter = countableIndelBuffer.iterator(); - while ( iter.hasNext() ) { - - ReadBackedExtendedEventPileup p = iter.next(); - - if ( ref.getLocus().distance(p.getLocation()) >= MIN_DISTANCE ) { - countIndels(p); - iter.remove(); - } else { - break; - } - } - } - - // at this point we have counted (and discarded from the buffer) all indels that are sufficiently far behind - // the current position. Now it's time to examine the pileup at the current position in more details: - - if ( context.hasExtendedEventPileup() ) { - // if we got indels at current position: - - ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup().getPileupWithoutMappingQualityZeroReads(); - if ( pileup.size() < MIN_COVERAGE ) return 0; - - if ( pileup.getNumberOfDeletions() + pileup.getNumberOfInsertions() > MAX_INDELS ) { - // we got too many indel events. Maybe it's even a true event, and what we are looking for are - // errors rather than true calls. Hence, we do not need these indels. 
We have to 1) discard - // all remaining indels from the buffer: if they are still in the buffer, they are too close - // to the current position; and 2) make sure that the next position at which we attempt to count again is - // sufficiently far *after* the current position. - // System.out.println("Non countable indel event at "+pileup.getLocation()); - countableIndelBuffer.clear(); - coverageBuffer.clear(); // we do not want to count observations (read bases) around non-countable indel as well - skipToLoc = ref.getGenomeLocParser().createGenomeLoc(pileup.getLocation().getContig(),pileup.getLocation().getStop()+pileup.getMaxDeletionLength()+MIN_DISTANCE+1); - // System.out.println("Skip to "+skipToLoc); - } else { - // pileup does not contain too many indels, we need to store them in the buffer and count them later, - // if a non-countable indel event(s) do not show up too soon: - countableIndelBuffer.add(pileup); - } - return 0; - } - - // we are here only if we have a "regular" base pileup; let's count coverage: - - ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads(); - - int coverage = pileup.size() - pileup.getNumberOfDeletions(); // do not count bases that we did not sequence (deletions) - - if ( coverage < MIN_COVERAGE ) return 0; - // System.out.println("at "+ref.getLocus()+"; adding "+coverageBuffer.get(0)); - - if ( MIN_DISTANCE > 0 ) { - - totalObservationsMade += coverageBuffer.get(0); - coverageBuffer.shiftData(1); - coverageBuffer.set(MIN_DISTANCE-1,coverage); - } else totalObservationsMade += coverage; - return 1; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - return 0; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. 
- * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(Integer value, Integer sum) { - return value+sum; //To change body of implemented methods use File | Settings | File Templates. - } - - @Override - public void onTraversalDone(Integer result) { - for ( ReadBackedExtendedEventPileup p : countableIndelBuffer ) { - countIndels(p); - } - for ( int i = 0 ; i < MIN_DISTANCE ; i++ ) { - //System.out.println("done and printing "+coverageBuffer.get(i)); - totalObservationsMade += coverageBuffer.get(i); - } - super.onTraversalDone(result); - - - out.println("Total observations (bases): "+totalObservationsMade); - out.println("Indel error events:"); - out.println("len\tins_count\tins_rate\tdel_count\tdel_rate"); - int totalIns = 0; - int totalDels = 0; - for ( int i = 0 ; i < MAX_LENGTH ; i++ ) { - out.printf("%d\t%d\t%.3g\t%d\t%.3g%n",i+1, - insCounts[i],((double)insCounts[i])/totalObservationsMade, - delCounts[i],((double)delCounts[i])/totalObservationsMade - ); - totalIns += insCounts[i]; - totalDels += delCounts[i]; - } - out.println(); - out.print("Total indel errors found: "+(totalIns+totalDels)); - out.printf(" (rate: %.3g)%n",((double)(totalIns+totalDels))/totalObservationsMade); - out.print(" insertions: "+totalIns); - out.printf(" (rate: %.3g)%n",((double)totalIns)/totalObservationsMade); - out.print(" deletions: "+totalDels); - out.printf(" (rate: %.3g)%n",((double)totalDels)/totalObservationsMade); - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MarkIntervals.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MarkIntervals.java deleted file mode 100644 index 54bdfdf3f..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MarkIntervals.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to 
any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - 
-import java.util.List; -import java.util.Collection; -import java.util.ArrayList; -import java.io.PrintStream; -import java.io.InputStream; -import java.io.File; -import java.io.FileNotFoundException; - -import net.sf.picard.util.IntervalList; -import net.sf.picard.util.Interval; - -/** - * Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but - * very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to. - * This was its very first use. - */ -public class MarkIntervals extends RodWalker { - @Output - File out; - - @Input(doc="List of bad SNP sites", required=true) - File locs; - - List badSites = new ArrayList(); - IntervalList intervalList; - - @Argument(doc="Should we match intervals or just the sites?", required=false) - boolean matchJustSites = false; - - public void initialize() { - if ( this.getToolkit().getArguments().intervals.size() != 1 ) - throw new UserException("This walker only works with a single -L argument provided, sorry"); - - File intervalFile = new File(this.getToolkit().getArguments().intervals.get(0)); - try { - intervalList = IntervalList.fromFile(intervalFile); - } catch ( Exception ex ) { - throw new UserException.MalformedFile(intervalFile, "Couldn't read interval file", ex); - } - - try { - for ( String line : new XReadLines(locs, true) ) { - String parts[] = line.split(":"); - badSites.add(getToolkit().getGenomeLocParser().createGenomeLoc(parts[0], Integer.valueOf(parts[1]))); - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(locs, e); - } - } - - public Long reduceInit() { - return 0l; - } - - public boolean isReduceByInterval() { return true; } - - public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) { - return null; - } - - return 0L; - } - - public Long reduce(Long loc, Long prev) { - return 0L; - } - - public void 
onTraversalDone(List> finalReduce) { - long nBadIntervals = 0; - long nBadBases = 0; - - IntervalList badIntervals = new IntervalList(intervalList.getHeader()); - - if ( matchJustSites ) { - for ( GenomeLoc loc : badSites ) { - nBadIntervals++; - nBadBases += loc.size(); - badIntervals.add(new Interval(loc.getContig(), (int)loc.getStart(), (int)loc.getStop(), false, "nBadSites_" + 1)); - } - } else { - for ( Pair g : finalReduce ) { - int overlaps = 0; - GenomeLoc interval = g.getFirst(); - for ( GenomeLoc loc : badSites ) { - if ( interval.overlapsP(loc) ) { - logger.info(String.format("Overlapping %s with bad site %s", interval, loc)); - overlaps++; - } - } - - if ( overlaps > 0 ) { - nBadIntervals++; - nBadBases += interval.size(); - badIntervals.add(new Interval(interval.getContig(), (int)interval.getStart(), (int)interval.getStop(), false, "nBadSites_" + overlaps)); - //out.printf("%s %d%n", interval, overlaps); - } - } - } - - logger.info("No. intervals marked as bad: " + nBadIntervals); - logger.info("No. 
bases marked as bad: " + nBadBases); - badIntervals.write(out); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MendelianViolationClassifier.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MendelianViolationClassifier.java deleted file mode 100644 index 65e09f0db..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MendelianViolationClassifier.java +++ /dev/null @@ -1,647 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import 
org.broadinstitute.sting.utils.pileup.PileupElement; - -import java.io.PrintStream; -import java.util.*; - - -/** - * Takes in a VCF file for a trio (and optionally, bam files for some or all members) and classifies mendelian violations - * as deNovo events, or opposite homozygotes, and includes additional information pertaining to event QC (size of - * homozygous region in child and/or parents, whether a site is likely tri-allelic or not, etc) - * - * @Author chartl - * @Date Jun 8, 2010 - */ -public class MendelianViolationClassifier extends LocusWalker { - @Output - PrintStream out; - @Argument(shortName="f",fullName="familyPattern",required=true,doc="Pattern for the family structure (usage: mom+dad=child)") - String familyStr = null; - @Argument(shortName="ob",fullName="outputBed",required=true,doc="Output file to write the homozygous region information to") - PrintStream bedOutput = null; - @Argument(fullName="deNovoTriAllelicQ",required=false,doc="Cutoff for quality scores of 3rd allele at denovo mendelian violations to remove it from the denovo set; e.g. Q40 = need Q40 evidence for 3rd allele to toss out deNovo") - int deNovoTriQ = 20; - @Argument(fullName="deNovoParentalAllele",required=false,doc="Range for the parental allele at denovo sites to be kept in denovo set, e.g. 
0.4-0.6 will toss out denovo sites with parental allele proportions of <0.4 and >0.6") - String deNovoParentalAllele = "-0.1-1.1"; - @Argument(fullName="oppositeHomozygoteTriAllelicQ",required=false,doc="Cutoff for quality scores of 3rd allele at opposite homozygote sites to remove it from the violation set") - int opHomTriQ = 20; - @Argument(fullName="oppositeHomozygoteAlleleProportion",required=false,doc="Range for the parental allele in the parents at opposite homozygote sites for it to be kept in violation set") - String opHomAlleleProp = "-0.1-1.1"; - - - /* - *********** PRIVATE CLASSES - */ - - public class ExtendedTrioStructure { - public String mom, dad, child; - public HashMap homozygousRegions; - public HashMap homozygousRegionCounts; - public HashMap regionKeys; - public org.broadinstitute.sting.utils.MendelianViolation mvObject; - - public ExtendedTrioStructure(String family) { - mvObject = new org.broadinstitute.sting.utils.MendelianViolation(family, 0); - this.child = mvObject.getSampleChild(); - this.mom = mvObject.getSampleMom(); - this.dad = mvObject.getSampleDad(); - homozygousRegions = new HashMap(3); - homozygousRegionCounts = new HashMap(3); - homozygousRegions.put(child,null); - homozygousRegions.put(mom,null); - homozygousRegions.put(dad,null); - homozygousRegionCounts.put(child,0); - homozygousRegionCounts.put(mom,0); - homozygousRegionCounts.put(dad,0); - regionKeys = new HashMap(3); - regionKeys.put(child,MendelianInfoKey.ChildHomozygosityRegion); - regionKeys.put(mom,MendelianInfoKey.MotherHomozygosityRegion); - regionKeys.put(dad,MendelianInfoKey.FatherHomozygosityRegion); - } - - public void updateHomozygosityRegions(MendelianViolation v, PrintStream output) { - if ( ! 
v.siteIsFiltered() ) { - ArrayList brokenRegions = new ArrayList(3); - // can only enter or break regions at unfiltered calls - for( Map.Entry memberGenotype : v.getUnderlyingGenotypes().entrySet() ) { - // for each family member - if ( homozygousRegions.get(memberGenotype.getKey()) == null ) { - // currently in a heterozygous region, update if possible - if ( memberGenotype.getValue().isHom() ) { - homozygousRegionCounts.put(memberGenotype.getKey(),homozygousRegionCounts.get(memberGenotype.getKey())+1); - homozygousRegions.put(memberGenotype.getKey(),new HomozygosityRegion(v.getLocus())); - if ( v.type != MendelianViolationType.NONE ) { - v.addAttribute(regionKeys.get(memberGenotype.getKey()).getKey(),homozygousRegionCounts.get(memberGenotype.getKey())); - } - } - } else { - // potentially breaking a homozygous region - if ( memberGenotype.getValue().isHom() ) { - // no break, update the region - HomozygosityRegion r = homozygousRegions.get(memberGenotype.getKey()); - r.lastSeen = v.getLocus(); - r.callsWithinRegion++; - if ( v.type != MendelianViolationType.NONE && ! 
v.violationIsFiltered() ) { - v.addAttribute(regionKeys.get(memberGenotype.getKey()).getKey(),homozygousRegionCounts.get(memberGenotype.getKey())); - if ( v.type == MendelianViolationType.DE_NOVO_SNP ) { - r.deNovoSNPsInRegion++; - } else if ( v.type == MendelianViolationType.OPPOSITE_HOMOZYGOTE ) { - r.oppositeHomsInRegion++; - } - } - } else if ( memberGenotype.getValue().isHet() ) { - // explicitly check for hets -- no calls are not counted -- this breaks a region so we print it - homozygousRegions.get(memberGenotype.getKey()).finalize(v.getLocus(),memberGenotype.getKey(),homozygousRegionCounts.get(memberGenotype.getKey())); - brokenRegions.add(homozygousRegions.get(memberGenotype.getKey())); - homozygousRegions.put(memberGenotype.getKey(),null); - } - } - } - - if ( brokenRegions.size() > 0 ) { - Collections.sort(brokenRegions); - } - - for ( HomozygosityRegion r : brokenRegions ) { - output.printf("%s%n",r); - } - } - } - } - - public class HomozygosityRegion implements Comparable{ - public GenomeLoc regionStart; - public GenomeLoc lastSeen; - public GenomeLoc endedBy; - public int callsWithinRegion; - public int oppositeHomsInRegion; - public int deNovoSNPsInRegion; - private String parent; - private int id; - - public HomozygosityRegion(GenomeLoc start) { - regionStart = start; - lastSeen = start; - endedBy = null; - callsWithinRegion = 0; - oppositeHomsInRegion = 0; - deNovoSNPsInRegion = 0; - } - - public void finalize(GenomeLoc regionEnd,String parent, int id) { - endedBy = regionEnd; - this.parent = parent; - this.id = id; - } - - private int getSizeLowerBound() { - return lastSeen.distance(regionStart); - } - - private int getSizeOfFirstHomToFirstHet() { - return endedBy.distance(regionStart); - } - - public String toString() { - return String.format("%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d",regionStart.getContig(),regionStart.getStart(), - lastSeen.getStart(),endedBy.getStart(),parent,id,getSizeLowerBound(),getSizeOfFirstHomToFirstHet(), - 
callsWithinRegion,oppositeHomsInRegion,deNovoSNPsInRegion); - } - - public int compareTo(Object o) { - if ( ! ( o instanceof HomozygosityRegion) ) { - return Integer.MIN_VALUE; - } - - return this.regionStart.compareTo(((HomozygosityRegion) o).regionStart); - } - - public String getContigStr() { - return regionStart.getContig(); - } - } - - public class MendelianViolation { - private VariantContext trio; - public MendelianViolationType type; - private HashMap newAttributes; - private HashMap homozygosityRegions; - private boolean filtered = false; - - public MendelianViolation(VariantContext context, MendelianViolationType violationType) { - trio = context; - type = violationType; - newAttributes = new HashMap(); - newAttributes.put(MendelianInfoKey.ViolationType.getKey(),type); - homozygosityRegions = new HashMap(3); - } - - public void addAttribute(String key, Object val) { - newAttributes.put(key,val); - } - - public VariantContext toVariantContext() { - newAttributes.putAll(trio.getAttributes()); - return new VariantContext(trio.getSource(), trio.getChr(), trio.getStart(), trio.getEnd(),trio.getAlleles(),trio.getGenotypes(),trio.getNegLog10PError(),trio.filtersWereApplied()?trio.getFilters():null,newAttributes); - } - - public boolean siteIsFiltered() { - return trio.isFiltered(); - } - - public void addRegions(HashMap regionIDsByName ) { - for ( Map.Entry e : regionIDsByName.entrySet() ) { - setRegion(e.getKey(),e.getValue()); - } - } - - public void setRegion(String parent, int regionID) { - homozygosityRegions.put(parent,regionID); - } - - public boolean isInPreviousRegions(Map otherRegions) { - for ( String s : otherRegions.keySet() ) { - if ( homozygosityRegions.get(s) >= otherRegions.get(s) ) { - return false; - } - } - - return true; - } - - public Map getUnderlyingGenotypes() { - return trio.getGenotypes(); - } - - public GenomeLoc getLocus() { - return VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),trio); - } - - public byte 
getRefBase() { - return trio.getReference().getBases()[0]; - } - - public Object getAttribute(String key) { - if ( newAttributes.keySet().contains(key) ) { - return newAttributes.get(key); - } else { - return trio.getAttribute(key); - } - } - - public void filter() { - filtered = true; - newAttributes.put(MendelianInfoKey.ViolationType.getKey(),"Filtered_"+newAttributes.get(MendelianInfoKey.ViolationType.getKey())); - } - - public String getType() { - return filtered ? "Filtered_"+type.toString() : type.toString(); - } - - public boolean violationIsFiltered() { - return filtered; - } - } - - public class Range { - private double upper; - private double lower; - private double epsilon = 10e-3; - - Range(String rangeStr) { - String rs = rangeStr.substring(0); // don't clobber original string - boolean startIsNegative = rangeStr.startsWith("-"); - if ( startIsNegative ) { - rs = rs.substring(1); - } - String[] lu = rs.split("-"); - lower = startIsNegative ? -1*Double.parseDouble(lu[0]) : Double.parseDouble(lu[0]); - upper = Double.parseDouble(lu[1]); - //System.out.printf("Lower: %.2f, Upper: %.2f",lower,upper); - } - - public boolean contains(double p) { - return p > lower-epsilon && p < upper+epsilon; - } - } - - /* - *************** PRIVATE ENUMS - */ - - public enum MendelianViolationType { - OPPOSITE_HOMOZYGOTE("oppositeHomozygote"), - DE_NOVO_SNP("deNovoSNP"), - NONE("none"); - - private String infoString; - - MendelianViolationType(String typeName) { - infoString=typeName; - } - - public String toString() { - return infoString; - } - } - - public enum MendelianInfoKey { - ViolationType("MVT","String",1,"\"The Mendelian violation type\""), - ParentalDeletion("deletedParent","String",1,"\"The parent from whom the child (putatively) inherited a deletion at opposite homozygous sites\""), - TriAllelicQuality("TriAlQ","Integer",1,"\"The variant quality of the third allele at putative tri-allelic sites\""), - TriAllelicBase("TriAlB","String",1,"\"The third allele at 
putative tri-allelic sites\""), - MotherHomozygosityRegion("MHR","String",1,"\"An identifier for the mother's homozygosity region where the violation is located\""), - FatherHomozygosityRegion("FHR","String",1,"\"An identifier for the father's homozygosity region where the violation is located\""), - ChildHomozygosityRegion("CHR","Integer",1,"\"An identifier for the child's homozygosity region where the violation is located\""), - ProportionOfParentAllele("PropParent","Float",1,"\"The proportion of bases in the child that were the parent allele at deNovo SNP sites\""), - /* ***************************************** UNUSED ************************************************ */ - NumCallsInRegion("REGION_NCALLS","Integer",1,"\"The number of unfiltered SNP calls found in the homozygosity region\""), - ChildHomozygosityRegionSize("CHRS","Integer",1,"\"The size of the region of homozygosity in the child in which the opposite homozygote is located\""), - OppositeHomozygotesInRegion("CHROH","Integer",1,"\"The number of opposite-homozygotes located in the region of homozygosity\""), - ParentHomozygosityRegionSize("PHRS","Integer",1,"\"The size of the parental homozygosity region where the deNovo SNP is located\""), - DeNovoSNPsInRegion("PHRDN","Integer",1,"\"The number of deNovo SNP events located in the same region of parental homozygosity where the deNovo SNP is located\""); - - - String keyName; - String valueType; - String valueDescription; - int numFields; - - MendelianInfoKey(String keyStr,String infoType, int fields, String description) { - keyName = keyStr; - valueType = infoType; - valueDescription = description; - numFields = fields; - } - - public String toString() { - return String.format("%s,%s,%d,%s",keyName,valueType,numFields,valueDescription); - } - - public String getKey() { - return keyName; - } - } - - /* - **************** PRIVATE DATA - */ - private ExtendedTrioStructure trioStructure; - private UnifiedGenotyperEngine engine; - private Range deNovoRange; 
- private Range opHomRange; - - /* - ***************** INITIALIZE - */ - public void initialize() { - trioStructure = new ExtendedTrioStructure(familyStr); - deNovoRange = new Range(deNovoParentalAllele); - opHomRange = new Range(opHomAlleleProp); - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.MIN_BASE_QUALTY_SCORE = 10; - uac.MIN_MAPPING_QUALTY_SCORE = 10; - uac.STANDARD_CONFIDENCE_FOR_CALLING = Math.min(deNovoTriQ,opHomTriQ); - engine = new UnifiedGenotyperEngine(getToolkit(),uac); - logger.info("Mom: "+trioStructure.mom+" Dad: "+trioStructure.dad+" Child: "+trioStructure.child); - bedOutput.printf("%s%n",getBedFileHeader()); - } - - /* - *********** REDUCE INIT - */ - public VCFWriter reduceInit() { - VCFWriter writer = new StandardVCFWriter(out); - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("source", "MendelianViolationClassifier")); - for ( MendelianInfoKey key : EnumSet.allOf(MendelianInfoKey.class) ) { - hInfo.add( new VCFHeaderLine("INFO",key.toString())); - } - VCFHeader vcfHeader = new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit())); - writer.writeHeader(vcfHeader); - - return writer; - } - - /* - ***************** FILTER - */ - public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return tracker != null; - } - - /* - *************** MAP - */ - public MendelianViolation map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return assessViolation(tracker.getVariantContext(ref,"trio", EnumSet.of(VariantContext.Type.SNP),ref.getLocus(),true),tracker,ref,context); - } - - private boolean isComplete(VariantContext vc) { - for ( Genotype g : vc.getGenotypes().values() ) { - if ( g.isNoCall() || g.isFiltered() ) { - return false; - } - } - - return true; - } - - private MendelianViolation assessViolation(VariantContext varContext, RefMetaDataTracker tracker, 
ReferenceContext reference, AlignmentContext context) { - MendelianViolation violation; - if ( varContext != null ) { - if ( isComplete(varContext) && trioStructure.mvObject.isViolation(varContext) ) { - if ( isDeNovo(varContext) ) { - violation = assessDeNovo(varContext,tracker,reference,context); - } else if ( isOppositeHomozygote(varContext) ) { - violation = assessOppositeHomozygote(varContext,tracker,reference,context); - } else { - throw new ReviewedStingException("Mendelian violation that is neither deNovo nor opposite homozygote. Should never see this."); - } - } else { - violation = new MendelianViolation(varContext,MendelianViolationType.NONE); - } - - } else { - violation = null; - } - - return violation; - } - - // Note, mendelian violation is guaranteed at this point; deNovo only happens at child het sites - private boolean isDeNovo(VariantContext trio) { - return trio.getGenotype(trioStructure.child).isHet(); - } - - // Note, mendelian violation is guaranteed at this point; technically we do not have to check this...but we are - private boolean isOppositeHomozygote(VariantContext trio) { - if ( trio.getGenotype(trioStructure.child).isHet() ) { // not valid at child het sites - return false; - } else if ( trio.getHetCount() > 1 ) { // child is not het, so if this is 2, mom and dad are both het, invalid - return false; - } else if ( trio.getGenotype(trioStructure.dad) == null || trio.getGenotype(trioStructure.mom) == null ) { - return false; - } - - return true; - } - - /* - ************ ASSESS DE NOVO AND OPPOSITE HOMOZYGOTES - */ - - private MendelianViolation assessDeNovo(VariantContext trio, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - MendelianViolation violation = new MendelianViolation(trio,MendelianViolationType.DE_NOVO_SNP); - - // look for mis-genotyped sites by examining the proportion of the parent-allele bases in the child -- - // as a spoeedup we do this only at non-filtered sites - - if ( ! 
trio.isFiltered() ) { - Allele parental = trio.getGenotype(trioStructure.mom).getAllele(0); // guaranteed homozygous - if ( parental.getBases().length < 1 ) { - throw new ReviewedStingException("Parental bases have length zero at "+trio.toString()); - } - - Map splitContext = AlignmentContextUtils.splitContextBySampleName(context); - Double proportion = getAlleleProportion(parental, splitContext.get(trioStructure.child)); - if ( proportion != null ) { - violation.addAttribute(MendelianInfoKey.ProportionOfParentAllele.getKey(), proportion); - if ( ! deNovoRange.contains(proportion) ) { - //System.out.println("Filtering deNovo by proportion: is "+proportion+" should be in range "+deNovoRange.lower+"-"+deNovoRange.upper); - violation.filter(); - } - } - - Pair triAl = getTriAllelicQuality(tracker,ref,trio,splitContext); - if ( triAl != null ) { - violation.addAttribute(MendelianInfoKey.TriAllelicBase.getKey(),triAl.first.toString()); - violation.addAttribute(MendelianInfoKey.TriAllelicQuality.getKey(),triAl.second); - if ( triAl.second >= deNovoTriQ ) { - violation.filter(); - } - } - - } else { - violation.filter(); - } - - return violation; - } - - private MendelianViolation assessOppositeHomozygote(VariantContext trio, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - MendelianViolation violation = new MendelianViolation(trio,MendelianViolationType.OPPOSITE_HOMOZYGOTE); - logger.debug(getParentalDeletion(trio)); - violation.addAttribute(MendelianInfoKey.ParentalDeletion.getKey(),getParentalDeletion(trio)); - - // look for tri-allelic sites mis-called as hom -- as a speedup we do this only at non-filtered, non genotype error sites - - if ( ! 
trio.isFiltered() ) { - Map splitCon = AlignmentContextUtils.splitContextBySampleName(context); - Pair triAl = getTriAllelicQuality(tracker, ref, trio, splitCon); - if ( triAl != null ) { - violation.addAttribute(MendelianInfoKey.TriAllelicBase.getKey(),triAl.first.toString()); - violation.addAttribute(MendelianInfoKey.TriAllelicQuality.getKey(),triAl.second); - if ( triAl.second >= opHomTriQ ) { - violation.filter(); - } - } - - Double childProp = getAlleleProportion(trio.getGenotype(trioStructure.child).getAllele(0),splitCon.get(trioStructure.child)); - Double motherProp = getAlleleProportion(trio.getGenotype(trioStructure.mom).getAllele(0),splitCon.get(trioStructure.mom)); - Double fatherProp = getAlleleProportion(trio.getGenotype(trioStructure.dad).getAllele(0),splitCon.get(trioStructure.dad)); - if ( childProp != null ) { - violation.addAttribute(MendelianInfoKey.ProportionOfParentAllele.getKey(),childProp); - if ( ! opHomRange.contains(childProp) ) { - violation.filter(); - } - } - - if ( motherProp != null && ! opHomRange.contains(motherProp) ) { - violation.filter(); - } - - if ( fatherProp != null && ! 
opHomRange.contains(fatherProp) ) { - violation.filter(); - } - } else { - violation.filter(); - } - - return violation; - } - - private Double getAlleleProportion(Allele a, AlignmentContext context) { - int numParental = 0; - int total = 0; - if ( context != null ) { - for ( PileupElement e : context.getBasePileup()) { - if ( e.getQual() >= 10 && e.getMappingQual() >= 10 ) { - total++; - if ( e.getBase() == a.getBases()[0]) { - numParental ++; - } - } - } - return ( (double) numParental )/total; - } else { - return null; - } - - - } - - private Pair getTriAllelicQuality(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext var, Map strat) { - int conf = 0; - Allele alt = null; - for ( Map.Entry sEntry : strat.entrySet() ) { - VariantCallContext call = engine.calculateLikelihoodsAndGenotypes(tracker, ref, sEntry.getValue()); - if ( call != null && call.confidentlyCalled ) { - if ( call.isSNP() ) { - if ( ! call.getAlternateAllele(0).basesMatch(var.getAlternateAllele(0))) { - if ( alt == null ) { - alt = call.getAlternateAllele(0); - conf = (int) Math.floor(10*call.getNegLog10PError()); - } else { - conf += (int) Math.floor(10*call.getNegLog10PError()); - } - } - } - } - } - - if ( alt == null ) { - return null; - } else { - return new Pair(alt,conf); - } - } - - private String getParentalDeletion(VariantContext trio) { - // case 1, mom and dad are both hom, so missing is the one whose alleles don't match the child - if ( trio.getGenotype(trioStructure.mom).isHom() && trio.getGenotype(trioStructure.dad).isHom() ) { - if ( trio.getGenotype(trioStructure.mom).isHomRef() == trio.getGenotype(trioStructure.child).isHomRef() ) { - // mom and child both hom ref or hom var - return trioStructure.dad; - } else if ( trio.getGenotype(trioStructure.dad).isHomRef() == trio.getGenotype(trioStructure.child).isHomRef() ) { - return trioStructure.mom; - } else { - // child matches neither parent - return "genotypeError"; - } - } else { // case 2, either mom or dad is het 
- the hom must be the missing allele - return trio.getGenotype(trioStructure.mom).isHet() ? trioStructure.dad : trioStructure.mom; - } - } - - /* - *************** REDUCE - */ - public VCFWriter reduce(MendelianViolation variant, VCFWriter writer) { - if ( variant != null ) { - trioStructure.updateHomozygosityRegions(variant,bedOutput); - writer.add(variant.toVariantContext(),variant.getRefBase()); - } - - return writer; - } - - /* - ********** ON TRAVERSAL DONE - */ - public void onTraversalDone(VCFWriter writer) { - Map regions = trioStructure.homozygousRegions; - Map counts = trioStructure.homozygousRegionCounts; - List to_print = new ArrayList(3); - for ( Map.Entry entryRegion : regions.entrySet() ) { - if ( entryRegion.getValue() != null ) { - logger.info("---------------- REGION NOT FINALIZED -----------------"); - logger.info(String.format("%s,%s,%s,%d,%d",entryRegion.getKey(),entryRegion.getValue().regionStart,entryRegion.getValue().lastSeen, - entryRegion.getValue().deNovoSNPsInRegion,entryRegion.getValue().oppositeHomsInRegion)); - int chr_end = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(entryRegion.getValue().getContigStr()).getSequenceLength(); - entryRegion.getValue().endedBy = getToolkit().getGenomeLocParser().createGenomeLoc(entryRegion.getValue().getContigStr(),chr_end,chr_end); - to_print.add(entryRegion.getValue()); - } - } - - Collections.sort(to_print); - for ( HomozygosityRegion hr : to_print ) { - bedOutput.printf("%s%n",hr); - } - } - - /* - ***************** STATIC METHODS - */ - - public static String getBedFileHeader() { - return String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s","Chrom","first_seen_hom","last_seen_hom","first_seen_het", - "sample_name","region_id","homozygous_region_size", "size_to_first_het","calls_within_region", - "opposite_homozygotes_in_region","deNovo_SNPs_in_region"); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchCounterWalker.java 
b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchCounterWalker.java deleted file mode 100755 index 10b7d5d2a..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchCounterWalker.java +++ /dev/null @@ -1,54 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.commandline.Output; - -import java.util.List; -import java.io.PrintStream; - -@WalkerName("CountMismatches") -public class MismatchCounterWalker extends ReadWalker { - @Output - PrintStream out; - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - int nMismatches = 0; - - int start = read.getAlignmentStart()-1; - int stop = read.getAlignmentEnd(); - // sometimes BWA outputs screwy reads - if ( stop - start > ref.getBases().length ) - return 0; - - if ( read.getAlignmentBlocks().size() == 1 ) { - // No indels - List refSeq = Utils.subseq(ref.getBases()); - List readBases = Utils.subseq(read.getReadBases()); - - assert(refSeq.size() == readBases.size()); - - out.printf("start, stop = %d %d%n", start, stop); - out.println(read.format()); - out.println(Utils.baseList2string(refSeq)); - out.println(Utils.baseList2string(readBases)); - for ( int i = 0; i < refSeq.size(); i++) { - if ( refSeq.get(i) != readBases.get(i) ) - nMismatches++; - } - out.println(nMismatches); - } - - return nMismatches; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchHistoWalker.java 
b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchHistoWalker.java deleted file mode 100755 index 2c99315ca..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchHistoWalker.java +++ /dev/null @@ -1,90 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.commandline.Output; - -import java.util.List; -import static java.lang.reflect.Array.*; -import java.io.PrintStream; - -@WalkerName("Mismatch_Histogram") -public class MismatchHistoWalker extends ReadWalker { - @Output - PrintStream out; - - protected long[] mismatchCounts = new long[0]; - protected final int MIN_TARGET_EDIT_DISTANCE = 5; - protected final int MAX_TARGET_EDIT_DISTANCE = 10; - - // Do we actually want to operate on the context? 
- public boolean filter(ReferenceContext ref, SAMRecord read) { - // we only want aligned reads - return !read.getReadUnmappedFlag(); - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - - int editDist = Integer.parseInt(read.getAttribute("NM").toString()); - - // ignore alignments with indels for now - if ( read.getAlignmentBlocks().size() == 1 && - editDist >= MIN_TARGET_EDIT_DISTANCE && - editDist <= MAX_TARGET_EDIT_DISTANCE ) { - - int start = read.getAlignmentStart()-1; - int stop = read.getAlignmentEnd(); - // sometimes BWA outputs screwy reads - if ( stop - start > ref.getBases().length ) - return 0; - - List refSeq = Utils.subseq(ref.getBases()); - List readBases = Utils.subseq(read.getReadBases()); - assert(refSeq.size() == readBases.size()); - - // it's actually faster to reallocate a resized array than to use ArrayLists... - if ( ref.getBases().length > mismatchCounts.length ) { - int oldLength = mismatchCounts.length; - mismatchCounts = (long[])resizeArray(mismatchCounts, refSeq.size()); - for ( int i = oldLength; i < refSeq.size(); i++ ) - mismatchCounts[i] = 0; - } - - String refStr = Utils.baseList2string(refSeq).toUpperCase(); - String readStr = Utils.baseList2string(readBases).toUpperCase(); - - boolean reverseFlag = read.getReadNegativeStrandFlag(); - for ( int i = 0; i < refStr.length(); i++) { - if ( refStr.charAt(i) != readStr.charAt(i) ) - mismatchCounts[(reverseFlag ? 
(refStr.length()-1-i) : i)]++; - } - } - - return 1; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - public void onTraversalDone(Integer result) { - for ( int i = 0; i < mismatchCounts.length; i++ ) - out.println((i+1) + "\t" + mismatchCounts[i]); - } - - private static Object resizeArray (Object oldArray, int newSize) { - int oldSize = getLength(oldArray); - Class elementType = oldArray.getClass().getComponentType(); - Object newArray = newInstance(elementType,newSize); - int preserveLength = Math.min(oldSize,newSize); - if (preserveLength > 0) - System.arraycopy (oldArray,0,newArray,0,preserveLength); - return newArray; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/NeighborhoodQualityWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/NeighborhoodQualityWalker.java deleted file mode 100755 index 884a71598..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/NeighborhoodQualityWalker.java +++ /dev/null @@ -1,177 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.commandline.Output; -import net.sf.samtools.SAMRecord; - -import java.util.List; -import java.util.Iterator; -import java.io.PrintStream; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to 
permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Oct 23, 2009 - * - * This walker is designed to work as the first pass in a two-pass processing step. - * It does a by-locus traversal calculating a neighborhood quality score based on a number of factors: - * 1.) Number of reads in neighborhood whose mate is mapped to a different chromosome. - * 2.) Average mapping quality of reads in the neighborhood. - * 3.) Average reference mismatch rate for all reads in the neighborhood. - * The output file is a list of: (GenomeLoc QualityScore) for every locus - * - * This walker is designed to be used in conjunction with ReadQualityScoreWalker. 
- */ - -/** - * Example of what the output should look like: - * - * 1:10999919 25.781164 - * 1:10999920 30.321754 - * 1:10999921 30.321754 - * 1:10999922 30.321754 - * 1:10999923 30.005175 - * 1:10999924 29.82714 - * 1:10999925 29.901012 - * 1:10999926 24.971085 - * 1:10999927 24.634737 - * 1:10999928 21.552652 - * 1:10999929 21.95971 - * 1:10999930 21.95971 - * 1:10999931 20.272423 - * 1:10999932 18.20454 - * 1:10999933 18.20454 - * 1:10999934 18.20454 - */ - -public class NeighborhoodQualityWalker extends LocusWalker { - @Output - PrintStream out; - - public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - float neighborhoodQualityScore = 0.0f; - int numReadsMismatchedMate = 0; // num of reads in this locus whose mate is mapped to different chromosome - float percentMismatchedMate = 0.0f; // percentage of reads at this locus whose mate is mapped to diff chromosome - float avgMappingQuality = 0.0f; // mean mapping quality for all reads at this locus - float avgMismatchRate = 0.0f; // mean mismatch with reference rate over all reads at this locus - - int numValidMappingQuality = 0; - long sumMappingQuality = 0L; - float sumMismatchRate = 0.0f; - int mappingQuality = 0; // preallocate for use in while loop below - boolean isGoodPair = false; // preallocate for use in while loop below - SAMRecord read = null; // preallocate for use in while loop below - - List reads = context.getReads(); - Iterator readsIter = reads.iterator(); - // EB: the problem with "assert" in java is that it's disabled by default and you actively need - // to enable them on the command line (-ea), so we should really use a formal if statement. 
- assert reads.size() > 0 : "This locus has no reads."; - - while( readsIter.hasNext() ) { // for each read in this context - read = readsIter.next(); - - // Only consider reads for this calculation whose mapping quality isn't 0 or 255 - mappingQuality = read.getMappingQuality(); - // EB: actually, mapping quality zero reads are crucial to this calculation as they - // affect the overall reliability of this neighborhood (i.e. the more of them that - // there are, the less we trust the alignments in the neighborhood). - if ( mappingQuality > 0 && mappingQuality < 255 ) { - - // Generate sum of mapping quality for all reads at this locus - sumMappingQuality += mappingQuality; - numValidMappingQuality++; - - // Look to see if mate was mapped to different chromosome - //isGoodPair = ( read.getReadPairedFlag() ? read.getProperPairFlag() : true ); - - // EB: I'm pretty sure that getReadPairedFlag() just tells you whether the read has a mate - // I think you want to be checking getMateReferenceIndex() relative to this read's ref index - isGoodPair = ( !read.getReadPairedFlag() || read.getProperPairFlag() ); // optimized version of above line - if ( !isGoodPair ) { numReadsMismatchedMate++; } - - // Generate sum number of mismatches for all reads at this locus - - // EB: It turns out that the NM attribute is not always set correctly (Mark found several cases) - // so we can't trust it. Check out AlignmentUtils for several methods that already determine the number of mismatches. 
- if( read.getAttribute("NM") != null ) { - sumMismatchRate += ((float) Integer.parseInt(read.getAttribute("NM").toString())) / ((float) read.getReadLength()); - } else { - sumMismatchRate += 1.0f; - } - } - } - - // Calculate averages from sums which accumulated during while loop above - if ( numValidMappingQuality == 0 ) { numValidMappingQuality = 1; } - percentMismatchedMate = ((float) numReadsMismatchedMate) / ((float) numValidMappingQuality); - avgMappingQuality = sumMappingQuality / ((float) numValidMappingQuality); - avgMismatchRate = sumMismatchRate / ((float) numValidMappingQuality); - - // Calculate the three metrics that go into a neighborhood quality score using exponential decay model - // BUGBUG: some analysis is needed to determine reasonable rates and scale factors for the exponential functions - float scoreMates = 40.0f * (float) Math.exp( -16.0f * (float) percentMismatchedMate ); - // exp decay with rate 16.0, scaled to Q=40 when mismatched mates is 0% - float scoreMapping = 40.0f * (float) Math.exp( -0.02f * Math.max( 99.0f - avgMappingQuality, 0.0f ) ); - // exp decay with rate 0.02, scaled to Q=40 when avg map quality is 99 - float scoreMismatch = 40.0f * (float) Math.exp( -27.0f * avgMismatchRate ); - // exp decay with rate 27.0, scaled to Q=40 when avg mismatch rate in reads is 0 - - // BUGBUG: some analysis is needed to determine reasonable weights for each metric - neighborhoodQualityScore = 0.35f * scoreMates + 0.05f * scoreMapping + 0.6f * scoreMismatch; - assert neighborhoodQualityScore >= 0.0f : "Neighborhood quality score must be nonnegative."; - if( neighborhoodQualityScore < 1.0f ) { neighborhoodQualityScore = 1.0f; } - - // verbose debug printing lines - logger.debug( context.getLocation() + " " + neighborhoodQualityScore ); - logger.debug( "mate mismatch% =\t" + percentMismatchedMate + " --> " + scoreMates ); - logger.debug( "mean mappingQ =\t" + avgMappingQuality + " --> " + scoreMapping ); - logger.debug( "mean mismatchRate =\t" 
+ avgMismatchRate + " --> " + scoreMismatch ); - - // This printout useful for making histograms of scores in Matlab - //out.println( neighborhoodQualityScore + " " + scoreMates + " " + scoreMapping + " " + scoreMismatch); - - out.println( context.getLocation() + " " + neighborhoodQualityScore ); - return 0; - } - - public Long reduceInit() { - return 0L; - } - - public Long reduce( Integer value, Long sum ) { - return 0L; // nothing to do here - } - - public void onTraversalDone( Long reduceResult ) { - } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/PairedQualityScoreCountsWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/PairedQualityScoreCountsWalker.java deleted file mode 100644 index cc41b42c1..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/PairedQualityScoreCountsWalker.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import net.sf.samtools.SAMRecord; - -import java.io.PrintStream; - -/** - * This walker prints out quality score counts for first and second reads of a pair aggregated over all reads - * in the interval. 
- * - * @Author: Chris Hartl - */ -public class PairedQualityScoreCountsWalker extends ReadWalker,Pair> { - @Output - public PrintStream out; - - @Argument(fullName="readLength", shortName="rl", doc="Length of reads in the bam file", required=true) - public int readLength = -1; - - public void initialize() { return; } - - public Pair reduceInit() { - return new Pair( new CycleQualCounts(readLength), new CycleQualCounts(readLength) ); - } - - public Pair reduce( Pair mapCounts, Pair reduceCounts ) { - if ( mapCounts != null ) { - if ( mapCounts.second ) { - reduceCounts.first.update(mapCounts.first); - } else { - reduceCounts.second.update(mapCounts.first); - } - } - - return reduceCounts; - } - - public Pair map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - if ( canUseRead(read) ) { - return getCorrectlyOrientedBaseQualities(read); - } else { - return null; - } - } - - private boolean canUseRead(SAMRecord read) { - return ( ! read.getMateUnmappedFlag() && ! read.getReadUnmappedFlag() ) && ( read.getReadPairedFlag() && read.getReadLength() == readLength ); - } - - private Pair getCorrectlyOrientedBaseQualities(SAMRecord read) { - byte[] quals = read.getReadNegativeStrandFlag() ? 
Utils.reverse(read.getBaseQualities()) : read.getBaseQualities(); - return new Pair(quals, read.getFirstOfPairFlag()); - } - - public void onTraversalDone(Pair finalCounts) { - StringBuilder output = new StringBuilder(); - output.append(String.format("%s\t%s\t%s%n","Cycle","First_read_counts","Second_read_counts")); - for ( int offset = 0; offset < readLength; offset++ ) { - output.append(String.format("%d\t%s\t%s%n",offset,finalCounts.first.getCountDistribution(offset),finalCounts.second.getCountDistribution(offset))); - } - out.printf("%s",output.toString()); - } - -} - -class CycleQualCounts { - private long[][] qualityCountsByCycle; - private int cycleLength; - private int qualMax = QualityUtils.MAX_REASONABLE_Q_SCORE + 1; - - public CycleQualCounts(int cycleLength) { - this.cycleLength = cycleLength; - qualityCountsByCycle = new long[cycleLength][qualMax]; - for ( int cycle = 0; cycle < cycleLength; cycle++ ) { - for ( int qual = 0; qual < qualMax; qual++) { - qualityCountsByCycle[cycle][qual] = 0; - } - } - } - - public void update(int offset, byte quality) { - qualityCountsByCycle[offset][qualityToQualityIndex(quality)]++; - } - - - public void update(byte[] qualArray) { - for ( int o = 0; o < cycleLength; o++ ) { - update(o,qualArray[o]); - } - } - - private int qualityToQualityIndex(byte qual) { - return qual < 0 ? 0 : qual > qualMax ? 
qualMax : qual; - } - - public long[][] getCounts() { return qualityCountsByCycle; } - - public String getCountDistribution(int offset) { - StringBuilder b = new StringBuilder(); - for ( int qual = 0; qual < qualMax-1; qual++ ) { - b.append(String.format("%d;",qualityCountsByCycle[offset][qual])); - } - b.append(String.format("%d",qualityCountsByCycle[offset][qualMax-1])); - - return b.toString(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/Percent20xCoverage.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/Percent20xCoverage.java deleted file mode 100755 index b493e272d..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/Percent20xCoverage.java +++ /dev/null @@ -1,56 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; - -import java.io.PrintStream; - -/** - * Walks over the input data set, calculating the percentage of loci covered by at least 20 reads. 
- */ -public class Percent20xCoverage extends LocusWalker implements TreeReducible { - @Output(doc="Write count to this file instead of STDOUT") - PrintStream out; - - @Argument(fullName="target_coverage", shortName="coverage", doc="Set the target coverage", required=false) - private int targetCoverage = 20; - - - private long totalLoci; - private long totalCoverage; - - public void initialize () { - totalLoci = 0; - totalCoverage = 0; - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - totalLoci++; - int coverage = context.getBasePileup().size(); - totalCoverage += coverage; - if (coverage >= targetCoverage) - return 1; - return 0; - } - - public Long reduceInit() { - return 0l; - } - - public Long reduce(Integer value, Long sum) { - return value + sum; - } - - public Long treeReduce(Long lhs, Long rhs) { - return lhs + rhs; - } - - public void onTraversalDone( Long c ) { - out.println(Math.floor(totalCoverage/totalLoci) + "\t" + (double) c/totalLoci); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/QualityScoreByStrandWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/QualityScoreByStrandWalker.java deleted file mode 100644 index 7d0439eaf..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/QualityScoreByStrandWalker.java +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * 
included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import java.io.File; -import java.io.IOException; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; -import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper; -import net.sf.samtools.SAMRecord; - -import java.util.HashMap; -import java.io.PrintWriter; - -/** - * This walker prints out quality score counts for forward and reverse stranded reads aggregated over all loci - * in the interval. Furthermore, it prints out quality score counts at a particular offset of forward and reverse - * reads, aggregated across all paired-end reads in the interval. 
- * - * @Author: Chris Hartl - */ -@ReadFilters({PlatformUnitFilter.class}) -public class QualityScoreByStrandWalker extends LocusWalker { - @Argument(fullName="readLength", shortName="rl", doc="Maximum length of the reads in the bam file", required=true) - int maxReadLength = -1; - @Argument(fullName="locusCountsOutput", shortName="lcf", doc="File to print locus count information to", required=true) - String locusOutput = null; - @Argument(fullName="pairCountsOutput", shortName="pcf", doc="File to print pair count information to; when not specified pair count statistics is not collected", required=false) - String pairOutput = null; - @Argument(fullName="useCycle", shortName="c", doc="Use cycle directly rather than strand", required=false) - boolean useCycle = false; - @Argument(fullName="silent", shortName="s", doc="Don't echo results into stdout, just print them into the specified files.", required=false) - boolean silent= false; - @Argument(fullName="minMapQ",shortName="q",doc="Use only reads with mapping quality at or above this value.",required=false) - int MIN_MAPQ = -1; - @Argument(fullName="blacklistedLanes", shortName="BL", - doc="Name of lanes (platform units) that should be ignored. 
Reads coming from these lanes will never be seen "+ - "by this application.", required=false) - PlatformUnitFilterHelper dummy; - - public HashMap pairCache = new HashMap(); - - public StrandedCounts reduceInit() { - return new StrandedCounts(maxReadLength); - } - - public StrandedCounts map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - StrandedCounts counts = new StrandedCounts(maxReadLength); - updateCounts(counts,context, ref); - return counts; - } - - public StrandedCounts reduce( StrandedCounts map, StrandedCounts red ) { - map.update(red); - return map; - } - - public void updateCounts( StrandedCounts counts, AlignmentContext context, ReferenceContext ref ) { - ReadBackedPileup p = context.getBasePileup().getMappingFilteredPileup(MIN_MAPQ); - for ( PileupElement e : p ) { - updateLocus(counts,e,ref); - if ( pairOutput != null ) updateReads(counts,e,ref); - } - } - - public void updateLocus( StrandedCounts counts, PileupElement e, ReferenceContext ref ) { - if ( ! useCycle ) { - counts.updateLocus( (int) e.getQual(), ! 
e.getRead().getReadNegativeStrandFlag() ); - } else { - if ( e.getRead().getReadPairedFlag() ) { - counts.updateLocus( (int) e.getQual(), e.getRead().getFirstOfPairFlag() ); - } else { - counts.updateLocus( (int) e.getQual(), true ); - } - } - } - - public void updateReads( StrandedCounts counts, PileupElement e, ReferenceContext ref ) { - SAMRecord read = e.getRead(); - String readString = read.getReadName() + read.getReadNegativeStrandFlag(); - String mateString = read.getReadName() + !read.getReadNegativeStrandFlag(); - if ( pairCache.containsKey(readString) ) { // read is already in there - // do nothing - } else if ( pairCache.containsKey( mateString ) ) { // has the mate - byte[] mate = (byte[]) pairCache.remove(mateString); - updatePairCounts(counts,ref,e,read,mate); - } else { // has neither read nor mate - pairCache.put(readString, read.getBaseQualities() ); // only store qualities, should help gc going haywire - } - } - - public void updatePairCounts( StrandedCounts counts, ReferenceContext ref, PileupElement e, SAMRecord read, byte[] mateQuals ) { - byte[] readQuals = read.getBaseQualities(); - if ( ! useCycle ) { - if ( read.getReadNegativeStrandFlag() ) { - updateReadQualities(mateQuals,readQuals,counts); - } else { - updateReadQualities(readQuals,mateQuals,counts); - } - } else { - if ( read.getFirstOfPairFlag() ) { - updateReadQualities(readQuals,mateQuals,counts); - } else { - updateReadQualities(mateQuals,readQuals,counts); - } - } - } - - public void updateReadQualities(byte[] forQuals, byte[] revQuals, StrandedCounts counts) { - for ( int i = 0; i < forQuals.length; i ++ ) { - counts.updateReadPair((int) forQuals[i], (int) revQuals[forQuals.length-1-i],i,forQuals.length-1-i); - } - } - - public void onTraversalDone(StrandedCounts finalCounts) { - try { - if ( ! 
silent ) { - System.out.println("#$"); //delimeter - System.out.print(finalCounts.locusCountsAsString()); - System.out.println("#$"); - } - PrintWriter locusOut = new PrintWriter(locusOutput); - locusOut.print(finalCounts.locusCountsAsString()); - locusOut.close(); - if ( pairOutput != null ) { - if ( ! silent ) { - System.out.println("Unmatched reads="+pairCache.size()); - System.out.println("#$"); - System.out.println("#$"); - System.out.print(finalCounts.pairCountsAsString()); - System.out.print("#$"); - } - PrintWriter pairOut = new PrintWriter(pairOutput); - pairOut.print(finalCounts.pairCountsAsString()); - pairOut.close(); - } - } catch ( IOException e ) { - throw new UserException.CouldNotCreateOutputFile(new File(pairOutput), e); - } - } -} - -/* - * this class holds four arrays of longs for quality score counts - */ -class StrandedCounts { - public int readLength; - public long[][] forwardCountsByOffset; - public long[][] reverseCountsByOffset; - public long[] forwardCountsLocusAggregate; - public long[] reverseCountsLocusAggregate; - - public StrandedCounts(int maxReadLength) { - readLength = maxReadLength; - forwardCountsByOffset = new long[maxReadLength][QualityUtils.MAX_REASONABLE_Q_SCORE+3]; - reverseCountsByOffset = new long[maxReadLength][QualityUtils.MAX_REASONABLE_Q_SCORE+3]; - forwardCountsLocusAggregate = new long[QualityUtils.MAX_REASONABLE_Q_SCORE+3]; - reverseCountsLocusAggregate = new long[QualityUtils.MAX_REASONABLE_Q_SCORE+3]; - for ( int q = 0; q < QualityUtils.MAX_REASONABLE_Q_SCORE+3; q ++ ) { - for ( int l = 0; l < maxReadLength; l ++ ) { - forwardCountsByOffset[l][q] = 0l; - reverseCountsByOffset[l][q] = 0l; - } - forwardCountsLocusAggregate[q] = 0l; - reverseCountsLocusAggregate[q] = 0l; - } - } - - public void updateLocus( int quality, boolean forward) { - if ( forward ) { - forwardCountsLocusAggregate[quality < 0 ? 0 : quality > 40 ? 40 : quality]++; - } else { - reverseCountsLocusAggregate[quality < 0 ? 0 : quality > 40 ? 
40 : quality]++; - } - } - - public void updateReadPair( int fQual, int rQual, int fOff, int rOff ) { // hehe f Off - if ( rOff < 0 || fOff < 0 ) - throw new ReviewedStingException("Offset is negative. Should never happen."); - forwardCountsByOffset[fOff][fQual < 0 ? 0 : fQual > 40 ? 40 : fQual]++; - reverseCountsByOffset[rOff][rQual < 0 ? 0 : rQual > 40 ? 40 : rQual]++; - } - - public void update( StrandedCounts otherCounts ) { - - for ( int q = 0; q < QualityUtils.MAX_REASONABLE_Q_SCORE+3; q ++ ) { - for ( int l = 0; l < readLength; l ++ ) { - forwardCountsByOffset[l][q] += otherCounts.forwardCountsByOffset[l][q]; - reverseCountsByOffset[l][q] += otherCounts.reverseCountsByOffset[l][q]; - } - - forwardCountsLocusAggregate[q] += otherCounts.forwardCountsLocusAggregate[q]; - reverseCountsLocusAggregate[q] += otherCounts.reverseCountsLocusAggregate[q]; - } - } - - public String pairCountsAsString() { - StringBuffer buf = new StringBuffer(); - StringBuffer check = new StringBuffer(); - String test = ""; - for ( int i = 0; i < readLength; i ++ ) { - //System.out.println("APPENDING LINE: "+i); - buf.append(i); - check.append(i); - test = test+i; - for ( int j = 0; j < QualityUtils.MAX_REASONABLE_Q_SCORE+3; j ++ ) { - buf.append("\t"); - buf.append(forwardCountsByOffset[i][j]); - test = test+"\t"+forwardCountsByOffset[i][j]; - buf.append(";"); - buf.append(reverseCountsByOffset[i][j]); - test = test+"\t"+reverseCountsByOffset[i][j]; - } - test = test+"\n"; - buf.append("\n"); - check.append("\n"); - } - //System.out.print(check.toString()); - return buf.toString(); - } - - public String locusCountsAsString() { - StringBuffer buf = new StringBuffer(); - for ( int i = 0; i < forwardCountsLocusAggregate.length; i ++ ) { - buf.append(i); - buf.append("\t"); - buf.append(forwardCountsLocusAggregate[i]); - buf.append("\t"); - buf.append(reverseCountsLocusAggregate[i]); - buf.append(String.format("%s%n","")); - } - - return buf.toString(); - } -} diff --git 
a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ReadErrorRateWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ReadErrorRateWalker.java deleted file mode 100755 index 9f41ddc69..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ReadErrorRateWalker.java +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import net.sf.samtools.SAMRecord; - -import java.util.HashMap; -import java.util.Map; -import java.io.PrintStream; - -/** - * ReadErrorRateWalker assesses the error rate per read position ('cycle') by comparing the - * read to its home on the reference and noting the mismatch rate. It ignores reads with - * indels in them, treats high and low-quality reference bases the same, and does not count - * ambiguous bases as mismatches. It's also thread-safe, so you can process a slew of reads - * in short order. 
- * - * @author Kiran Garimella - */ -public class ReadErrorRateWalker extends ReadWalker implements TreeReducible { - @Output PrintStream out; - @Argument(fullName="printVisualHits", shortName="v", doc="print visual hits", required=false) public boolean printVisualHits = false; - @Argument(fullName="useNextBestBase", shortName="nb", doc="use next best base", required=false) public boolean useNextBestBase = false; - @Argument(fullName="useNonNextBestBase",shortName="nnb",doc="use nonnext best base",required=false) public boolean useNonNextBestBase = false; - @Argument(fullName="useNextRandomBase", shortName="nr", doc="use next random base", required=false) public boolean useNextRandomBase = false; - - /** - * Ignore reads with indels or clipping - * - * @param read the read to assess - * @return true if the read can be processed, false if it should be ignored - */ - public boolean filter(ReferenceContext ref, SAMRecord read) { - return (read.getCigar().numCigarElements() == 1 && read.getReadLength() <= ref.getBases().length && (!useNonNextBestBase || read.getAttribute("SQ") != null)); - } - - /** - * For each read, return a boolean array indicating the locations of the mismatch. - * Length of the array is one element longer than the read length. The last element - * of this array is always "true" so that we can figure out how many reads we - * processed in a thread-safe manner. - * - * @param read the read to assess - * @return An array of length (read_length + 1) indicating where the mismatches occur. - * Last element is for internal use so the reduce() function can figure out how - * many reads we processed. 
- */ - public boolean[] map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - boolean[] errorsPerCycle = new boolean[read.getReadLength() + 1]; - - byte[] bases = read.getReadBases(); - byte[] sq = (byte[]) read.getAttribute("SQ"); - - if (printVisualHits) { - System.out.println(read.getReadName()); - for (int cycle = 0; cycle < bases.length; cycle++) { - System.out.print((char) bases[cycle]); - } - System.out.println(); - - for (int cycle = 0; cycle < bases.length; cycle++) { - byte compBase = convertIUPACBaseToSimpleBase(ref.getBases()[cycle]); - - System.out.print((char) compBase); - } - System.out.println("\n"); - } - - for (int cycle = 0; cycle < bases.length; cycle++) { - byte compBase = convertIUPACBaseToSimpleBase(ref.getBases()[cycle]); - - if (compBase != '.') { - if (useNextBestBase || useNextRandomBase || useNonNextBestBase) { - byte nextBestBase; - if (useNextBestBase) { - nextBestBase = BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(sq[cycle])); - } else if (useNonNextBestBase) { - nextBestBase = bases[cycle]; - while (nextBestBase == bases[cycle] || nextBestBase == BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(sq[cycle]))) { - nextBestBase = BaseUtils.baseIndexToSimpleBase(GenomeAnalysisEngine.getRandomGenerator().nextInt(4)); - } - } else { - nextBestBase = bases[cycle]; - while (nextBestBase == bases[cycle]) { - nextBestBase = BaseUtils.baseIndexToSimpleBase(GenomeAnalysisEngine.getRandomGenerator().nextInt(4)); - } - } - - if (nextBestBase != '.') { - if (read.getReadNegativeStrandFlag()) { - errorsPerCycle[bases.length - cycle - 1] = !(bases[cycle] == compBase || nextBestBase == compBase); - } else { - errorsPerCycle[cycle] = !(bases[cycle] == compBase || nextBestBase == compBase); - } - } - } else { - if (read.getReadNegativeStrandFlag()) { - errorsPerCycle[bases.length - cycle - 1] = !(bases[cycle] == compBase); - } else { - errorsPerCycle[cycle] = 
!(bases[cycle] == compBase); - } - } - } - } - - // We encode that we saw a read in the last position of the array. - // That way we know what to normalize by, and we get thread safety! - errorsPerCycle[errorsPerCycle.length - 1] = true; - - return errorsPerCycle; - } - - private byte convertIUPACBaseToSimpleBase(byte iupacBase) { - char compBase; - - switch (iupacBase) { - case 'A': - case 'a': compBase = 'A'; break; - case 'C': - case 'c': compBase = 'C'; break; - case 'G': - case 'g': compBase = 'G'; break; - case 'T': - case 't': compBase = 'T'; break; - default: compBase = '.'; break; - } - - return (byte) compBase; - } - - /** - * We don't initialize the array here because we need to know how long the read is first. - * - * @return null - */ - public ReadErrorRateCollection reduceInit() { - return new ReadErrorRateCollection(); - } - - /** - * Summarize the error rate data. - * - * @param value the read mismatch array - * @param collection the summed mismatch array - * @return the summed mismatch array with the new read mismatch array added - */ - public ReadErrorRateCollection reduce(boolean[] value, ReadErrorRateCollection collection) { - - collection.update(value); - - return collection; - } - - /** - * For multithreading - take two read error rate collections and put them together - * @param left one collection - * @param right another collection - * @return left updated with the counts from right - */ - public ReadErrorRateCollection treeReduce(ReadErrorRateCollection left, ReadErrorRateCollection right) { - left.merge(right); - return left; - } - - /** - * We've processed all the reads. Emit the final, normalized error rate data. 
- * - * @param collection the summed mismatch arrays - */ - public void onTraversalDone(ReadErrorRateCollection collection) { - - out.print(collection.toString()); - } -} - -class ReadErrorRateCollection { - private HashMap readsByReadLength; - - public ReadErrorRateCollection() { - readsByReadLength = new HashMap(); - } - - public void update(boolean[] mismatchArray) { - if ( ! readsByReadLength.containsKey(mismatchArray.length) ) { - readsByReadLength.put(mismatchArray.length, zeroArray(mismatchArray.length)); - } - - updateErrorCounts(readsByReadLength.get(mismatchArray.length), mismatchArray); - } - - public String toString() { - StringBuilder builder = new StringBuilder(); - for ( int length : readsByReadLength.keySet() ) { - for ( int cycle = 0; cycle < length-1; cycle++) { - int[] counts = readsByReadLength.get(length); - builder.append(length); - builder.append("\t"); - builder.append(cycle); - builder.append("\t"); - builder.append( ( ( double ) counts[cycle] / ( (double) counts[length-1]))); - builder.append("\n"); - } - } - return builder.toString(); - } - - public void merge(ReadErrorRateCollection other) { - for ( Map.Entry errorCounts : other.readsByReadLength.entrySet() ) { - if ( this.readsByReadLength.keySet().contains(errorCounts.getKey()) ) { - mergeCounts(readsByReadLength.get(errorCounts.getKey()),errorCounts.getValue()); - } else { - readsByReadLength.put(errorCounts.getKey(),errorCounts.getValue()); - } - } - } - - private static int[] zeroArray( int length ) { - int[] array = new int[length]; - for ( int ii = 0; ii < length; ii ++ ) { - array[ii] = 0; - } - - return array; - } - - private static void mergeCounts ( int[] addToMe, int[] dontTouchMe ) { - for ( int index = 0; index < addToMe.length; index ++ ) { - addToMe[index] += dontTouchMe[index]; - } - } - - public static void updateErrorCounts(int[] sum, boolean[] value) { - - for (int cycle = 0; cycle < value.length; cycle++) { - sum[cycle] += (value[cycle] ? 
1 : 0); - } - - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ReadQualityScoreWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ReadQualityScoreWalker.java deleted file mode 100755 index 95911da51..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ReadQualityScoreWalker.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileWriter; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Oct 23, 2009 - * - * This walker is designed to work as the second pass in a two-pass processing step. - * It does a by-read traversal calculating a read quality score based on a number of factors: - * 1.) Average neighborhood quality score for the length of the read (data generated by NeighborhoodQualityWalker) - * 2.) Is this read's mate mapped to a different chromosome? - * 3.) The mapping quality for this read. - * 4.) Number of reference mismatches in this read. - * This walker creates a new bam file in which each read is annotated by this read quality score - * in addition if the read quality score is below the given threshold, the read is flagged. - * - * This walker requires as input the file of (GenomeLoc QualityScore)'s generated by NeighborhoodQualityWalker. - * This walker accepts as input a threshold in order to flag reads which are of unacceptable read quality. - * - * This walker is designed to be used in conjunction with NeighborhoodQualityWalker. 
- */ - -public class - ReadQualityScoreWalker extends ReadWalker { - @Output - protected PrintStream out; - @Argument(fullName = "inputQualityFile", shortName = "if", doc = "Input quality score file generated by NeighborhoodQualityWalker", required = true) - protected String inputQualityFile = null; - @Argument(fullName = "outputBamFile", shortName = "of", doc = "Write output to this BAM filename instead of STDOUT", required = false) - protected SAMFileWriter outputBamFile = null; - @Argument(fullName = "threshold", shortName = "th", doc="Flag reads whose read quality score is below this threshold", required = false) - protected int qualityThreshold = 13; - - private BufferedReader inputReader = null; - private static String line = null; - - public SAMRecord map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) { - return read; // all the work is done in the reduce step for this walker - } - - public SAMFileWriter reduceInit() { - try { - inputReader = new BufferedReader( new FileReader ( inputQualityFile ) ); - } catch ( FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(new File(inputQualityFile), e); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(new File(inputQualityFile), e); - } - return outputBamFile; - } - - public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { - - int readQualityScore = 0; - float meanNeighborhoodQuality = 0.0f; - - // The large block of code below is parsing through the input file and calculating the meanNeighborhoodQuality over the length of the read - // It does this by first skipping in the file to where the current read starts and marking that location - // Next it continues reading lines for the length of the read generating a sum of neighborhood quality - // When it reaches the end of the read it jumps back to the marker so that it can be used by the next read - // BUGBUG: This assumes reads will be sorted by start location - float 
sumNeighborhoodQuality = 0.0f; - int numLines = 0; - GenomeLoc readLoc = getToolkit().getGenomeLocParser().createGenomeLoc( read ); - if( readLoc.size() > 0 ) { // only calculate mean NQS if the read has a well formed GenomeLoc, if not NQS will be zero - try { - if( line == null ) { - line = inputReader.readLine(); - if( line == null ) { throw new UserException.MalformedFile(new File(inputQualityFile), "Input file is empty" ); } - } - String[] halves = line.split( " ", 2 ); - GenomeLoc curLoc = getToolkit().getGenomeLocParser().parseGenomeLoc( halves[0] ); - while( curLoc.isBefore( readLoc ) ) { // Loop until the beginning of the read - line = inputReader.readLine(); - if( line == null ) { throw new UserException.MalformedFile(new File(inputQualityFile), "Input file doesn't encompass all reads. Can't find beginning of read: " + readLoc ); } - halves = line.split( " ", 2 ); - curLoc = getToolkit().getGenomeLocParser().parseGenomeLoc( halves[0] ); - } - // now we have skipped ahead in the input file to where this read starts - logger.debug( "Starting: " + curLoc + ", read: " + readLoc + "\t size: " + readLoc.size() ); - inputReader.mark( 30 * ( (int)readLoc.size() + 3 ) ); // BUGBUG: Is this a sufficient buffer size? - String savedLine = line; - - while( !curLoc.isPast( readLoc ) ) { // Loop until just past the end of the read - sumNeighborhoodQuality += Float.parseFloat( halves[1] ); - numLines++; - line = inputReader.readLine(); - if( line == null ) { throw new UserException.MalformedFile(new File(inputQualityFile), "Input file doesn't encompass all reads. 
Can't find end of read: " + readLoc ); } - halves = line.split( " ", 2 ); - curLoc = getToolkit().getGenomeLocParser().parseGenomeLoc( halves[0] ); - } - // now we have parsed the input file up to where the read ends - // reset back to the mark in order to parse the next read in the next call to the reduce function - inputReader.reset(); - line = savedLine; - - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(new File(inputQualityFile), e); - } catch (IOException e ) { - throw new UserException.CouldNotReadInputFile(new File(inputQualityFile), e); - } - - meanNeighborhoodQuality = sumNeighborhoodQuality / ((float) numLines); - } - - - // Find out if this read's mate mapped to a different chromosome - //boolean isGoodPair = ( read.getReadPairedFlag() ? read.getProperPairFlag() : true ); - boolean isGoodPair = ( !read.getReadPairedFlag() || read.getProperPairFlag() ); // optimized version of above line - - // Get the mapping quality for this read - int mappingQuality = read.getMappingQuality(); - - // Get the number of reference mismatches in this read - assert read.getReadLength() > 0 : "Read length must be greater than zero."; - float mismatchRate = 1.0f; - if( read.getAttribute("NM") != null ) { - mismatchRate = ((float) Integer.parseInt(read.getAttribute("NM").toString())) / ((float) read.getReadLength()); - } - - - // Calculate the three additional metrics that go into a read quality score - // BUGBUG: some analysis is needed to determine reasonable quality values and rates for the exponentials - float scoreMate = ( isGoodPair ? 
40.0f : 2.0f ); - float scoreMapping = 40.0f * (float) Math.exp( -0.02f * Math.max( 99.0f - mappingQuality, 0.0f ) ); - // exp decay with rate 0.02, scaled to Q=40 when mapping quality is 99 - float scoreMismatch = 40.0f * (float) Math.exp( -27.0f * mismatchRate ); - // exp decay with rate 27.0, scaled to Q=40 when the mismatch rate is 0% for this read - - // BUGBUG: some analysis is needed to determine reasonable weights for each metric - readQualityScore = Math.round( 0.6f * meanNeighborhoodQuality + 0.1f * scoreMate + 0.05f * scoreMapping + 0.25f * scoreMismatch ); - if( readQualityScore == 0 ) { readQualityScore = 1; } - assert readQualityScore > 0 : "Read quality score must be positive and nonzero."; - - // Add the read quality score to the read in the new bam file and flag it if quality is below the given threshold - // BUGBUG: which attributes should be set here? - read.setAttribute( "XR", readQualityScore ); - if( readQualityScore < qualityThreshold ) { - read.setAttribute( "ZR", 1 ); - } - - // verbose debug printing lines - logger.debug( read.getReadName() + " " + readQualityScore ); - logger.debug( "neighborhood quality =\t" + meanNeighborhoodQuality ); - logger.debug( "mate mismatch? 
=\t" + isGoodPair + " --> " + scoreMate ); - logger.debug( "mapping quality =\t" + mappingQuality + " --> " + scoreMapping ); - logger.debug( "ref mismatch rate =\t" + mismatchRate + " --> " + scoreMismatch ); - - // This printout useful for making histograms of scores in Matlab - //out.println( readQualityScore + " " + meanNeighborhoodQuality + " " + scoreMate + " " + scoreMapping + " " + scoreMismatch ); - - // Add the read to the output bam file or output to STDOUT - if ( output != null ) { - output.addAlignment( read ); - } else { - out.println( read.format() ); - } - - return output; - } - - public void onTraversalDone( SAMFileWriter reduceResult ) { - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/RealignedReadCounter.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/RealignedReadCounter.java deleted file mode 100755 index a81136d8c..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/RealignedReadCounter.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.*; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.File; -import java.util.*; - -@By(DataSource.READS) -// walker to count realigned reads -public class RealignedReadCounter extends ReadWalker { - - public static final String ORIGINAL_CIGAR_TAG = "OC"; - public static final String ORIGINAL_POSITION_TAG = "OP"; - - @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) - protected String intervalsFile = null; - - // the intervals input by the user - private Iterator intervals = null; - - // the current interval in the list - private GenomeLoc currentInterval = null; - - private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0; - private boolean intervalWasUpdated = false; - - public void initialize() { - // prepare to read intervals one-by-one, as needed (assuming they are sorted). - intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); - currentInterval = intervals.hasNext() ? 
intervals.next() : null; - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - if ( currentInterval == null ) { - return 0; - } - - GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read); - // hack to get around unmapped reads having screwy locations - if ( readLoc.getStop() == 0 ) - readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); - - if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) ) - return 0; - - if ( readLoc.overlapsP(currentInterval) ) { - if ( doNotTryToClean(read) ) - return 0; - - if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) { - String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG); - // deal with an old bug - if ( read.getCigar().toString().equals(newCigar) ) { - //System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar); - return 0; - } - - if ( !intervalWasUpdated ) { - intervalWasUpdated = true; - updatedIntervals++; - affectedBases += 20 + getIndelSize(read); - } - updatedReads++; - - } - } else { - do { - intervalWasUpdated = false; - currentInterval = intervals.hasNext() ? 
intervals.next() : null; - } while ( currentInterval != null && currentInterval.isBefore(readLoc) ); - } - - return 0; - } - - private int getIndelSize(SAMRecord read) { - for ( CigarElement ce : read.getCigar().getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.I ) - return 0; - if ( ce.getOperator() == CigarOperator.D ) - return ce.getLength(); - } - logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar()); - return 0; - } - - private boolean doNotTryToClean(SAMRecord read) { - return read.getReadUnmappedFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getReadFailsVendorQualityCheckFlag() || - read.getMappingQuality() == 0 || - read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || - (BadMateFilter.hasBadMate(read)); - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - System.out.println(updatedIntervals + " intervals were updated"); - System.out.println(updatedReads + " reads were updated"); - System.out.println(affectedBases + " bases were affected"); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ReplaceQuals.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ReplaceQuals.java deleted file mode 100755 index d8f37c01d..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ReplaceQuals.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is 
furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.collections.Pair; -import net.sf.samtools.*; - -import java.util.HashMap; -import java.io.File; -import java.io.PrintStream; - -/** - * ReadErrorRateWalker assesses the error rate per read position ('cycle') by comparing the - * read to its home on the reference and noting the mismatch rate. It ignores reads with - * indels in them, treats high and low-quality references bases the same, and does not count - * ambiguous bases as mismatches. It's also thread-safe, so you can process a slew of reads - * in short order. 
- * - * @author Kiran Garimella - */ -public class ReplaceQuals extends ReadWalker { - @Output - public PrintStream out; - - @Argument(shortName="inputQualsBAM",doc="BAM files containing qualities to be replaced",required=true) - public String inputQualsBAM; - - @Argument(shortName="outputBAM", required=false, doc="output BAM file for reads with replaced quals") - public SAMFileWriter outputBAM = null; - - public int MAX_READS_TO_LOAD = -1; - - private HashMap> readNameToPairs; - private int READ_PRINT_MOD = 100000; - private final boolean DEBUG = false; - - public void initialize() { - readNameToPairs = new HashMap>(); - - SAMFileReader samReader = new SAMFileReader(new File(inputQualsBAM)); - samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); - - int nReads = 0; - logger.info("Starting to read inputQualsBAM = " + inputQualsBAM); - for ( SAMRecord read : samReader ) { - //System.out.printf("READ is %s%n", read.format()); - - final String name = read.getReadName(); - Pair binding = readNameToPairs.containsKey(name) ? readNameToPairs.get(name) : new Pair(null, null); - if ( read.getFirstOfPairFlag() ) { - binding.first = read; - } else { - binding.second = read; - } - readNameToPairs.put(name, binding); - - if ( ++nReads % READ_PRINT_MOD == 0 ) { - logger.info(String.format(" Read %d reads so far...", nReads)); - } - - if ( nReads > MAX_READS_TO_LOAD && MAX_READS_TO_LOAD != -1 ) - break; - } - - logger.info("Done reading input BAM"); - } - - /** - * - */ - public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - final String name = read.getReadName(); - - if ( readNameToPairs.containsKey(name) ) { - Pair binding = readNameToPairs.get(name); - SAMRecord qRead = read.getFirstOfPairFlag() ? 
binding.first : binding.second; - if (qRead != null) { - - if ( DEBUG ) { - System.out.printf("Replacing read %s quals with %s%n", read.getReadName(), qRead.getReadName()); - System.out.printf("%s%n", read.getReadName()); - System.out.printf("%s%n", qRead.getReadName()); - System.out.printf("%s%n", read.getReadString()); - System.out.printf("%s%n", qRead.getReadString()); - System.out.printf("%s%n", read.getBaseQualityString()); - System.out.printf("%s%n", qRead.getBaseQualityString()); - - //if (! read.getReadString().equals(qRead.getReadString())) - // throw new RuntimeException(String.format("BUG: equating %s and %s but bases are different", read.getReadName(), qRead.getReadName())); - } - - read.setBaseQualities(qRead.getBaseQualities()); - } - } - - return read; - } - - // ----------------------------------------------------------------------------------------------- - // Standard i/o reduce - // - public void onTraversalDone(SAMFileWriter output) { - if ( output != null ) { - output.close(); - } - } - - public SAMFileWriter reduceInit() { - return outputBAM; - } - - /** - * - * - */ - public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) { - if ( output != null ) { - output.addAlignment(read); - } else { - out.println(read.format()); - } - - return output; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimulateReadsForVariants.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimulateReadsForVariants.java deleted file mode 100755 index 914a9d745..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimulateReadsForVariants.java +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to 
use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.Window; -import 
org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; - -import java.io.PrintWriter; -import java.util.*; - -import net.sf.samtools.*; - -import cern.jet.math.Arithmetic; -import cern.jet.random.Poisson; -import cern.jet.random.engine.MersenneTwister; - -/** - * Generates simulated reads for variants - */ -@Requires(value={}) -@Reference(window=@Window(start=-20,stop=20)) -public class SimulateReadsForVariants extends RefWalker { - @Argument(fullName = "vcf", shortName = "vcf", doc="Variants underlying the reads",required=true) - protected VCFWriter variantsWriter; - - @Argument(fullName = "sites", shortName = "sites", doc="Variants sites",required=true) - protected PrintWriter sitesWriter; - - @Output(fullName = "read", shortName = "reads", doc="Reads corresponding to variants",required=true) - protected StingSAMFileWriter readWriter; - - @Argument(fullName="nSamples", shortName="NS", doc="Number of samples to simulate", required=false) - public int nSamples = 1; - - @Argument(fullName="readDepth", shortName="DP", doc="Read depths to simulate", required=false) - public List readDepths = Arrays.asList(1); - - @Argument(fullName="errorRate", shortName="ER", doc="Phred-scaled error rate", required=false) - public List errorRates = Arrays.asList(20); - - @Argument(fullName="readLengths", shortName="RL", doc="Read length, in bp", required=false) - public List readLengths = Arrays.asList(3); - - public enum ReadSamplingMode { CONSTANT, POISSON }; - @Argument(fullName="readSamplingMode", shortName="RSM", doc="Sampling mode", required=false) - public List samplingModes = Arrays.asList(ReadSamplingMode.CONSTANT); - - @Argument(fullName="variantsPerBin", shortName="VPB", 
doc="No. of variants to generate for each bin", required=false) - public int variantsPerBin = 1; - - @Argument(fullName="verbose", shortName="verbose", doc="Verbose", required=false) - public boolean verbose = false; - - private class ParameterSet { - int readDepth, readLength; - ReadSamplingMode mode; - byte[] readQuals; - double errorRate; // in abs fraction (0.01 not Q20) - int nVariants = 0; - ParameterSet next = null; - Poisson poissonRandom = null; - Iterator acs; - int nSites = 0; - - public ParameterSet(int readDepth, int readLength, ReadSamplingMode mode, int phredErrorRate, ParameterSet next, List ACs ) { - this.readDepth = readDepth; - this.readLength = readLength; - this.mode = mode; - this.readQuals = new byte[readLength]; - Arrays.fill(readQuals, (byte)phredErrorRate); - this.errorRate = QualityUtils.qualToErrorProb((byte)phredErrorRate); - this.next = next; - nSites = ACs.size(); - acs = ACs.iterator(); - - if ( mode == ReadSamplingMode.POISSON ) - poissonRandom = new Poisson(readDepth, new MersenneTwister((int)RANDOM_SEED)); - } - - public void incCount() { nVariants++; } - public boolean done() { return ! acs.hasNext(); } - public boolean hasNext() { return next != null; } - - public int combinations() { - return nSites + ( hasNext() ? 
next.combinations() : 0); - } - } - - List alleleCounts = new ArrayList(); - - ParameterSet parameters = null; - SAMFileHeader header = null; - - private static String SAMPLE_PREFIX = "SAMPLE"; - public static final String PROGRAM_RECORD_NAME = "GATK SimulateReadsForVariants"; - - List sampleNames = new ArrayList(); - Map sample2RG = new HashMap(); - - private String sampleName(int i) { return sampleNames.get(i); } - private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } - - private static final long RANDOM_SEED = 1252863495; - private static final Random ran = new Random(RANDOM_SEED); - - int SEPARATION_BETWEEN_SITES = 10; - - private SAMReadGroupRecord createRG(String name) { - SAMReadGroupRecord rg = new SAMReadGroupRecord(name); - rg.setPlatform("ILLUMINA"); - rg.setSample(name); - return rg; - } - - public void initialize() { - // initialize sample I -> sample info map - List sampleRGs = new ArrayList(); - - for ( int i = 0; i < nSamples; i++ ) { - sampleNames.add(String.format("%s%04d", SAMPLE_PREFIX, i)); - SAMReadGroupRecord rg = createRG(sampleName(i)); - sampleRGs.add(rg); - sample2RG.put(sampleName(i), rg); - } - - for ( int i = 0; i <= (2 * nSamples); i++) { - int nCopies = (int)Math.round((2.0* nSamples) / (Math.max(i, 1))); - for ( int j = 0; j < (nCopies * variantsPerBin); j++ ) - alleleCounts.add(i); - } - - // initialize VCF headers - // todo -- fill out header - Set headerLines = new HashSet(); - headerLines.add(new VCFHeaderLine("source", "SimulateReadsForVariants")); - variantsWriter.writeHeader(new VCFHeader(headerLines, new HashSet(sampleNames))); - - // initialize BAM headers - header = new SAMFileHeader(); - header.setSequenceDictionary(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary()); - header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - header.setReadGroups(sampleRGs); - - final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); - final ResourceBundle 
headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); - programRecord.setProgramVersion(headerInfo.getString("org.broadinstitute.sting.gatk.version")); - programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); - header.setProgramRecords(Arrays.asList(programRecord)); - - readWriter.writeHeader(header); - - // set up feature sets - for ( int readLength : readLengths ) { - if ( readLength % 2 == 0 ) throw new UserException.BadArgumentValue("readLength", "Read lengths must be odd"); - - for ( ReadSamplingMode mode : samplingModes ) { - for ( int errorRate : errorRates ) { - for ( int readDepth : readDepths ) { - parameters = new ParameterSet(readDepth, readLength, mode, errorRate, parameters, alleleCounts); - } - } - } - } - logger.info("Total number of combinations " + parameters.combinations()); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( parameters.done() ) { - if ( parameters.hasNext() ) - parameters = parameters.next; - else - return 0; // early abort, we're done generating - } - - if ( ref.getLocus().getStart() < parameters.readLength || ! 
BaseUtils.isRegularBase(ref.getBase()) ) - return 0; - - if ( ref.getLocus().getStart() % (parameters.readLength + SEPARATION_BETWEEN_SITES) != 0 ) - return 0; - - byte[] refBases = getBasesForReads(ref, parameters.readLength); - - // at this point, we want to generate variants and reads for the parameters in parameters - int AC = parameters.acs.next(); - VariantContext vc = generateVariant(context.getLocation(), ref.getBase(), AC, parameters); - if ( verbose ) logger.info(String.format("Generating reads for %s", vc)); - ReadBackedPileup rbp = generateRBPForVariant(context.getLocation(), vc, refBases, parameters); - - // BED is zero based - sitesWriter.printf("%s %d %d%n", ref.getLocus().getContig(), ref.getLocus().getStart()-1, ref.getLocus().getStart() ); - variantsWriter.add(vc, ref.getBase()); - for ( SAMRecord read : rbp.getReads() ) readWriter.addAlignment(read); - - parameters.incCount(); - - return 0; - } - - private byte[] getBasesForReads(ReferenceContext ref, int readLength) { - int center = (int)(ref.getLocus().getStart() - ref.getWindow().getStart()); - int start = center - ((readLength - 1) / 2); - byte[] bases = new byte[readLength]; - System.arraycopy(ref.getBases(), start, bases, 0, readLength); - return bases; - } - - private VariantContext generateVariant( GenomeLoc loc, byte refBase, int AC, ParameterSet params ) { - Allele ref = Allele.create(refBase, true); - Allele alt = Allele.create(BaseUtils.baseIndexToSimpleBase(BaseUtils.getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(refBase)))); - List alleles = AC == 0 ? 
Arrays.asList(ref) : Arrays.asList(ref, alt); - - List homRef = Arrays.asList(ref, ref); - List het = Arrays.asList(ref, alt); - List homAlt = Arrays.asList(alt, alt); - - List genotypes = new ArrayList(); - double p = AC / (2.0 * nSamples); - //double q = 1 - p; - int nHomAlt = (int) Math.round(p * p * nSamples); - int nHet = AC - nHomAlt * 2; - //int nHet = (int) Math.round(2 * p * q * nSamples); - for ( int i = 0; i < nSamples; i++ ) { - List genotype; - - if ( i < nHomAlt ) { genotype = homAlt; } - else if ( i < (nHet + nHomAlt) ) { genotype = het; } - else { genotype = homRef; } - - genotypes.add(new Genotype(sampleName(i), genotype)); - } - - Map attributes = new LinkedHashMap(); - attributes.put(VCFConstants.ALLELE_COUNT_KEY, AC); - attributes.put(VCFConstants.SAMPLE_NUMBER_KEY, nSamples); - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, 2 * nSamples); - attributes.put("Q", params.readQuals[0]); - attributes.put("MODE", params.mode); - attributes.put("DP", params.readDepth); - - return new VariantContext("anonymous", loc.getContig(), loc.getStart(), loc.getStart(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, VariantContext.PASSES_FILTERS, attributes ); - } - - private ReadBackedPileup generateRBPForVariant( GenomeLoc loc, VariantContext vc, byte[] refBases, ParameterSet params ) { - List reads = new ArrayList(); - int offset = (params.readLength - 1) / 2; - - int start = (int)(loc.getStart() - (params.readLength - 1) / 2); - byte altBase = vc.isVariant() ? 
vc.getAlternateAllele(0).getBases()[0] : 0; - byte[] refHaplotype = Arrays.copyOf(refBases, refBases.length); - byte[] altHaplotype = Arrays.copyOf(refBases, refBases.length); - altHaplotype[(params.readLength - 1) / 2] = altBase; - - int gi = 0; - for ( Genotype g : vc.getGenotypes().values() ) { - int myDepth = sampleDepth(params); - for ( int d = 0; d < myDepth; d++ ) { - byte[] readBases = trueHaplotype(g, refHaplotype, altHaplotype); - addMachineErrors(readBases, params.errorRate); - - SAMRecord read = new SAMRecord(header); - read.setBaseQualities(params.readQuals); - read.setReadBases(readBases); - read.setReadName("FOO"); - read.setCigarString(params.readLength + "M"); - read.setReadPairedFlag(false); - read.setAlignmentStart(start); - read.setMappingQuality(60); - read.setReferenceName(loc.getContig()); - read.setReadNegativeStrandFlag(gi++ % 2 == 0); - read.setAttribute("RG", sampleRG(g.getSampleName()).getReadGroupId()); - - reads.add(read); - } - } - - return new ReadBackedPileupImpl(loc, reads, offset); - } - - private int sampleDepth(ParameterSet params) { - switch ( params.mode ) { - case CONSTANT: return params.readDepth; - case POISSON: return params.poissonRandom.nextInt(); - default: - throw new IllegalStateException("Unexpected DepthSamplingType " + params.mode); - } - } - - private byte[] trueHaplotype(Genotype g, byte[] refHaplotype, byte[] altHaplotype) { - double refP = 0.0; - - if ( g.isHomRef() ) refP = 1; - else if ( g.isHet() ) refP = 0.5; - else refP = 0.0; - - return Arrays.copyOf(ran.nextDouble() < refP ? 
refHaplotype : altHaplotype, refHaplotype.length); - } - - private void addMachineErrors(byte[] readBases, double errorRate) { - for ( int i = 0; i < readBases.length; i++ ) { - double r = ran.nextDouble(); - if ( r < errorRate ) { - byte errorBase = BaseUtils.baseIndexToSimpleBase(BaseUtils.getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(readBases[i]))); - if ( errorBase == readBases[i] ) throw new IllegalStateException("Read and error bases are the same"); - readBases[i] = errorBase; - } - } - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) { - //variantsWriter.close(); - sitesWriter.close(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java deleted file mode 100644 index 9a820c4e3..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.alignment.bwa.BWAAligner; -import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; -import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; -import org.broadinstitute.sting.alignment.Alignment; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.util.StringUtil; -import net.sf.picard.reference.ReferenceSequence; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.PrintStream; -import java.util.Scanner; -import java.util.TreeMap; -import java.util.SortedMap; - -/** - * A walker to experiment with fishing for reads in the GATK. Has very limited utility in its current state. 
- * - * @author mhanna - * @version 0.1 - */ -public class TestReadFishingWalker extends ReadWalker { - /** - * An aligner for the small custom reference. - */ - private BWAAligner aligner; - - @Output - private PrintStream out; - - @Argument(fullName="indel_calls",shortName="ic",doc="Indel calls to use to derive custom references",required=true) - private File indelCalls; - - @Argument(fullName="buffer_width",shortName="bw",doc="How much reference to extract around the given event",required=false) - private int bufferWidth = 36; - - private SortedMap aligners = new TreeMap(); - - @Override - public void initialize() { - long startTime = System.currentTimeMillis(); - int numAlignersCreated = 0; - - IndexedFastaSequenceFile referenceReader; - FileInputStream indelCallInputStream; - try { - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - indelCallInputStream = new FileInputStream(indelCalls); - } - catch(IOException ex) { - throw new UserException.CouldNotReadInputFile(indelCalls, ex); - } - - Scanner indelCallReader = new Scanner(indelCallInputStream); - - while(indelCallReader.hasNext()) { - String contig = indelCallReader.next(); - int eventPos = indelCallReader.nextInt(); - int eventLength = indelCallReader.nextInt(); - char type = indelCallReader.next().toUpperCase().charAt(0); - byte[] bases = StringUtil.stringToBytes(indelCallReader.next()); - String sample = indelCallReader.next(); - - byte[] revisedReference; - int start,stop; - - if(type == 'D') { - start = eventPos-eventLength-bufferWidth; - stop = eventPos+eventLength+bufferWidth; - int eventStart = eventPos - start + 1; - int eventStop = eventStart + eventLength - 1; - - ReferenceSequence referenceSequence = referenceReader.getSubsequenceAt(contig,start,stop); - revisedReference = new byte[(stop-start+1) - eventLength]; - - System.arraycopy(referenceSequence.getBases(),0,revisedReference,0,eventStart); - 
System.arraycopy(referenceSequence.getBases(),eventStop+1,revisedReference,eventStart,stop-start-eventStop); - - } - else if(type == 'I') { - start = eventPos-bufferWidth; - stop = eventPos+bufferWidth; - int eventStart = eventPos - start + 1; - - ReferenceSequence referenceSequence = referenceReader.getSubsequenceAt(contig,start,stop); - revisedReference = new byte[(stop-start+1) + eventLength]; - - System.arraycopy(referenceSequence.getBases(),0,revisedReference,0,bufferWidth+1); - System.arraycopy(bases,0,revisedReference,eventStart,eventLength); - System.arraycopy(referenceSequence.getBases(),eventStart,revisedReference,eventStart+eventLength,bufferWidth); - } - else - throw new ReviewedStingException("Invalid indel type: " + type); - - aligners.put(getToolkit().getGenomeLocParser().createGenomeLoc(contig,start,stop),new BWACAligner(revisedReference,new BWAConfiguration())); - if(++numAlignersCreated % 100 == 0) - out.printf("Created %d aligners in %dms%n",++numAlignersCreated,System.currentTimeMillis()-startTime); - } - } - - @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - Alignment bestAlignment = aligner.getBestAlignment(read.getReadBases()); - System.out.println("bestAlignment = " + bestAlignment); - return 1; - } - - - /** - * Provide an initial value for reduce computations. - * @return Initial value of reduce. - */ - @Override - public Long reduceInit() { - return 0L; - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * @param value result of the map. - * @param accum accumulator for the reduce. - * @return accumulator with result of the map taken into account. 
- */ - @Override - public Long reduce(Integer value, Long accum) { - return value + accum; - } - - @Override - public void onTraversalDone(Long result) { - aligner.close(); - super.onTraversalDone(result); - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java deleted file mode 100755 index a23b064f1..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.walkers.Window; - -import java.util.EnumSet; -import java.io.PrintStream; - -/** - * Test routine for new VariantContext object - */ -@Reference(window=@Window(start=-1,stop=1)) -public class TestVariantContextWalker extends RodWalker { - @Output - PrintStream out; - - @Argument(fullName="takeFirstOnly", doc="Only take the first second at a locus, as opposed to all", required=false) - boolean takeFirstOnly = false; - - @Argument(fullName="onlyContextsOfType", doc="Only take variant contexts of this type", required=false) - VariantContext.Type onlyOfThisType = null; - - @Argument(fullName="onlyContextsStartinAtCurrentPosition", doc="Only take variant contexts at actually start at the current position, excluding those at span to the current location but start earlier", required=false) - boolean onlyContextsStartinAtCurrentPosition = false; - - @Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false) - boolean printContexts = false; - - @Argument(fullName="outputVCF", doc="If provided, we'll convert the first input context into a VCF", required=false) - VCFWriter writer = null; - - private boolean wroteHeader = false; - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ref == null ) - return 0; - else { - EnumSet 
allowedTypes = onlyOfThisType == null ? null : EnumSet.of(onlyOfThisType); - - int n = 0; - for (VariantContext vc : tracker.getAllVariantContexts(ref, allowedTypes, context.getLocation(), onlyContextsStartinAtCurrentPosition, takeFirstOnly) ) { - - // we need to trigger decoding of the genotype string to pass integration tests - vc.getGenotypes(); - - if ( writer != null && n == 0 ) { - if ( ! wroteHeader ) { - writer.writeHeader(VariantContextAdaptors.createVCFHeader(null, vc)); - wroteHeader = true; - } - - writer.add(vc, ref.getBase()); - } - - n++; - if ( printContexts ) out.printf(" %s%n", vc); - } - - if ( n > 0 && printContexts ) { - out.printf("%s => had %d variant context objects%n", context.getLocation(), n); - out.printf("---------------------------------------------%n"); - } - - return n; - } - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer point, Integer sum) { - return point + sum; - } - - @Override - public void onTraversalDone(Integer result) { - // Double check traversal result to make count is the same. - // TODO: Is this check necessary? 
- out.println("[REDUCE RESULT] Traversal result is: " + result); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java deleted file mode 100755 index 1519aa02e..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java +++ /dev/null @@ -1,147 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.*; -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above 
copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - - -/** - * Prints out all of the RODs in the input data set. Data is rendered using the toString() method - * of the given ROD. - */ -public class VCF4WriterTestWalker extends RodWalker { - private VCFWriter vcfWriter; - - @Output - private PrintStream out; - - @Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true) - private String OUTPUT_FILE = null; - - - public static final String INPUT_ROD_NAME = "variant"; - - protected static String line = null; - final TreeSet samples = new TreeSet(); - VCFCodec vcf4codec = new VCFCodec(); - - - /** - * Initialize the number of loci processed to zero. 
- * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - public void initialize() { - final List dataSources = this.getToolkit().getRodDataSources(); - - - // Open output file specified by output VCF ROD - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - - - vcfWriter = new StandardVCFWriter(new File(OUTPUT_FILE)); - VCFHeader header = null; - for( final ReferenceOrderedDataSource source : dataSources ) { - if(source.getName().equalsIgnoreCase(INPUT_ROD_NAME)) { - - try { - AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(source.getFile().getAbsolutePath())); - header = (VCFHeader)vcf4codec.readHeader(lineReader); - out.printf("Read %d header lines%n", header.getMetaData().size()); - } - catch (FileNotFoundException e ) { - throw new ReviewedStingException(e.getMessage()); - } - - final Set vcfSamples = header.getGenotypeSamples(); - samples.addAll(vcfSamples); - vcfWriter.writeHeader(header); - - - } - } - vcfWriter.close(); - - - } - - /** - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - GenomeLoc loc = context.getLocation(); - VariantContext vc = tracker.getVariantContext(ref,INPUT_ROD_NAME, null, loc, true); - - - if (vc == null) - return 0; - - // Write directly variant context to VCF4.0 format. - vcfWriter.add(vc, ref.getBase()); - - return 1; - } - - /** - * Increment the number of rods processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of rods processed. 
- */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) {} -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReads.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReads.java deleted file mode 100644 index 5c06c188f..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReads.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.commandline.Output; - -import java.util.Collection; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.Map; -import java.io.PrintStream; - -/** - * validate the rods for reads - */ -public class ValidateRODForReads extends ReadWalker { - // a mapping of the position to the count of rods - HashMap map = new LinkedHashMap(); - - @Output - private PrintStream out; - - @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - if (tracker != null) { - Map> mapping = tracker.getContigOffsetMapping(); - for (Map.Entry> entry : mapping.entrySet()) { - GenomeLoc location = ref.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),entry.getKey()); - if (!map.containsKey(location)) { - map.put(location,0); - } - map.put(location,map.get(location)+1); - } - - return mapping.size(); - } - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, 
Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - out.println("[REDUCE RESULT] Traversal result is: " + result + " ROD entries seen"); - for (GenomeLoc location : map.keySet()) { - out.println(location + " -> " + map.get(location)); - } - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/HammingDistance.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/HammingDistance.java deleted file mode 100755 index 62490b66f..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/HammingDistance.java +++ /dev/null @@ -1,111 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.annotator; - -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Oct 20, 2010 - * Time: 3:08:06 PM - * To change this template use File | Settings | File Templates. 
- */ -public class HammingDistance implements ExperimentalAnnotation, InfoFieldAnnotation { - - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( tracker == null ) { - return null; - } - VariantContext hamCon = tracker.getVariantContext(ref,"hamming",null,ref.getLocus(),true); - if ( hamCon == null ) { - return null; - } - - Set interSamples = new HashSet(vc.getSampleNames()); - interSamples.retainAll(hamCon.getSampleNames()); - - int dist = 0; - int nrd_num = 0; - int hamCalls = 0; - int vcCallsAtHamCalls = 0; - int num_variant = 0; - - for ( String s : interSamples ) { - dist += dist(vc.getGenotype(s),hamCon.getGenotype(s),true); - nrd_num += dist(vc.getGenotype(s),hamCon.getGenotype(s),false); - if ( vc.getGenotype(s).isHet() || vc.getGenotype(s).isHomVar() || hamCon.getGenotype(s).isHet() || hamCon.getGenotype(s).isHomVar() ) { - num_variant ++; - } - if ( hamCon.getGenotype(s).isCalled() ) { - hamCalls++; - if ( vc.getGenotype(s).isCalled() ) { - vcCallsAtHamCalls++; - } - } - - } - - HashMap map = new HashMap(1); - map.put("HMD",dist); - map.put("HCR",(0.0+vcCallsAtHamCalls)/(0.0+hamCalls)); - map.put("NRD",(0.0+nrd_num)/(0.0+num_variant)); - map.put("OGC",(0.0+nrd_num)/(0.0+interSamples.size())); - return map; - - } - - public int dist(Genotype a, Genotype b, boolean weightByAC) { - if ( a.isNoCall() || b.isNoCall() ) { - return 0; - } - if ( weightByAC ) { - if ( a.isHomRef() ) { - if ( b.isHomVar() ) { - return 2; - } else if ( b.isHet() ) { - return 1; - } else { - return 0; - } - } else if ( a.isHet() ) { - if ( b.isHom() ) { - return 1; - } else { - return 0; - } - } else { - if ( b.isHomRef() ) { - return 2; - } else if ( b.isHet() ) { - return 1; - } else { - return 0; - } - } - } else { - if ( ! 
a.equals(b) ) { - return 1; - } else { - return 0; - } - } - } - - public List getKeyNames() { return Arrays.asList("HMD","HCR","NRD","OGC"); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HMD",1, VCFHeaderLineType.Integer,"The hamming distance between record in Hamming ROD and this record"), - new VCFInfoHeaderLine("HCR",1,VCFHeaderLineType.Float,"The differential call rate between record in Hamming ROD and this record"), - new VCFInfoHeaderLine("NRD",1,VCFHeaderLineType.Float,"The Non-reference discrepancy between Hamming ROD and this record"), - new VCFInfoHeaderLine("OGC",1,VCFHeaderLineType.Float,"The Overall Genotype Concordance between Hamming ROD and this one")); } - - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/InsertSizeDistribution.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/InsertSizeDistribution.java deleted file mode 100644 index 95d4b6bf5..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/InsertSizeDistribution.java +++ /dev/null @@ -1,43 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.annotator; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.util.HashMap; -import java.util.Map; -import java.util.List; -import java.util.Arrays; - -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl - * - * 
@Author chartl - * @Date Mar 29, 2010 - */ -public class InsertSizeDistribution implements InfoFieldAnnotation { - private final long INSERT_SIZE_LOWER_BOUND = 500; - public List getKeyNames() { return Arrays.asList("INSIZE"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0),1, VCFHeaderLineType.Integer,"Do not use this if your name is not Chris")); } - - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map context, VariantContext variant) { - int weirdInsertSizeReads = 0; - for ( String sample : context.keySet() ) { - ReadBackedPileup pileup = context.get(sample).getBasePileup(); - for (PileupElement e : pileup ) { - if ( Math.abs(e.getRead().getInferredInsertSize()) > INSERT_SIZE_LOWER_BOUND ) { - weirdInsertSizeReads++; - } - } - } - - Map toReturn = new HashMap(); - toReturn.put(getKeyNames().get(0),String.format("%d",weirdInsertSizeReads)); - return toReturn; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/ProportionOfNonrefBasesSupportingSNP.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/ProportionOfNonrefBasesSupportingSNP.java deleted file mode 100644 index 1af1fa491..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/ProportionOfNonrefBasesSupportingSNP.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all 
copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers.annotator; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; - -import java.util.Map; -import java.util.HashMap; -import java.util.List; -import java.util.Arrays; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Dec 17, 2009 - * Time: 2:48:15 PM - * To change this template use File | Settings | File Templates. 
- */ -public class ProportionOfNonrefBasesSupportingSNP implements InfoFieldAnnotation { - private String KEY_NAME = "prop_nonref_that_are_snp"; - - public List getKeyNames() { return Arrays.asList(KEY_NAME); } - - public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine(KEY_NAME,1, VCFHeaderLineType.Float,"Simple proportion of non-reference bases that are the SNP base")); - } - - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map context, VariantContext vc) { - if ( ! vc.isSNP() || ! vc.isBiallelic() ) - return null; - - Pair totalNonref_totalSNP = new Pair(0,0); - for ( String sample : context.keySet() ) { - ReadBackedPileup pileup = context.get(sample).getBasePileup(); - totalNonref_totalSNP = getNonrefAndSNP(pileup, ref.getBaseAsChar(), vc.getAlternateAllele(0).toString().charAt(0), totalNonref_totalSNP); - - } - if ( totalNonref_totalSNP.equals(new Pair(0,0)) ) - return null; - double p = getProportionOfNonrefBasesThatAreSNP(totalNonref_totalSNP); - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%f", p )); - return map; - } - - private Pair getNonrefAndSNP(ReadBackedPileup p, char ref, char snp, Pair totals) { - int[] counts = p.getBaseCounts(); - int nonrefCounts = 0; - int snpCounts = counts[BaseUtils.simpleBaseToBaseIndex(snp)]; - for ( byte c : BaseUtils.BASES ) { - if ( ! 
BaseUtils.basesAreEqual(c, (byte) ref) ) { - nonrefCounts += counts[BaseUtils.simpleBaseToBaseIndex(c)]; - } - } - - totals.first+=nonrefCounts; - totals.second+=snpCounts; - return totals; - } - - private double getProportionOfNonrefBasesThatAreSNP( Pair totalNonref_totalSNP ) { - return ( 1.0 + totalNonref_totalSNP.second ) / (1.0 + totalNonref_totalSNP.first ); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/QualByDepthV2.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/QualByDepthV2.java deleted file mode 100755 index 416b7e8c2..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/QualByDepthV2.java +++ /dev/null @@ -1,64 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.annotator; - -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.AnnotationByDepth; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * A Qual By Depth calculation adjusted to account for allele counts by (n var samples)/(n var alleles) -- so an entirely - * homozygous variant receives a penalty of 1/2, while entirely het receives a (multiplicative) penalty of 1 (so no penalty) - * This does not necessarily work well in the case of non-confident genotypes (could over or under penalize) - */ -public class QualByDepthV2 extends AnnotationByDepth implements 
ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; - - final Map genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) - return null; - - //double QbyD = genotypeQualByDepth(genotypes, stratifiedContexts); - int qDepth = annotationByVariantDepth(genotypes, stratifiedContexts); - if ( qDepth == 0 ) - return null; - - double QbyD = hetHomAdjustment(vc) * 10.0 * vc.getNegLog10PError() / (double)qDepth; - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", QbyD)); - return map; - } - - public List getKeyNames() { return Arrays.asList("QD2"); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); } - - public double hetHomAdjustment(VariantContext vc) { - int variantSamples = 0; - int variantAlleles = 0; - for ( Genotype g : vc.getGenotypesSortedByName() ) { - if ( ! 
g.isFiltered() ) { - if ( g.isHet() ) { - variantSamples++; - variantAlleles++; - } else if ( g.isHomVar() ) { - variantSamples++; - variantAlleles += 2; - } - } - } - - return (0.0+variantSamples)/(0.0+variantAlleles); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/haplotype/ComputeRSquaredAndDPrime.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/haplotype/ComputeRSquaredAndDPrime.java deleted file mode 100755 index d2fbd26da..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/haplotype/ComputeRSquaredAndDPrime.java +++ /dev/null @@ -1,88 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.haplotype; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; - -import java.io.PrintStream; - -/** - * A very simple code snippet from Steve Schaffner that computs R^2 and D' given observed counts for - * AA, Aa, aA, and aa genotypes. This code is meant only to be instructive. Do not use this for - * anything, ever! 
- */ -public class ComputeRSquaredAndDPrime extends RodWalker { - @Argument(fullName="AA", shortName="AA", doc="Number of counts for AA genotype") public Integer AA; - @Argument(fullName="Aa", shortName="Aa", doc="Number of counts for Aa genotype") public Integer Aa; - @Argument(fullName="aA", shortName="aA", doc="Number of counts for aA genotype") public Integer aA; - @Argument(fullName="aa", shortName="aa", doc="Number of counts for aa genotype") public Integer aa; - - @Output - private PrintStream out; - - public void initialize() { - int i, j; - Integer[][] hap = new Integer[2][2]; - for (int k = 0; k < 2; k++) { - hap[k] = new Integer[2]; - } - - hap[0][0] = AA; - hap[0][1] = Aa; - hap[1][0] = aA; - hap[1][1] = aa; - - Integer[] colTot = new Integer[2]; - Integer[] rowTot = new Integer[2]; - double prod, dprime, f1, f2, ddenom, r2; - - for (i = 0; i < 2; i++) { rowTot[i] = colTot[i] = 0; } - for (i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) { - rowTot[j] += hap[i][j]; - colTot[i] += hap[i][j]; - } - } - prod = rowTot[0] * rowTot[1] * colTot[0] * colTot[1]; - - if (prod == 0) { - out.println("Missing data"); - System.exit(1); - } - dprime = hap[0][0] * hap[1][1] - hap[0][1] * hap[1][0]; - if (dprime > 0) { - f1 = rowTot[0] * colTot[1]; - f2 = rowTot[1] * colTot[0]; - ddenom = (f1 > f2) ? f2 : f1; - } else { - f1 = rowTot[0] * colTot[0]; - f2 = rowTot[1] * colTot[1]; - ddenom = (f1 > f2) ? 
f2 : f1; - } - r2 = dprime * dprime / prod; - dprime /= ddenom; - - out.printf("r2: %.5f%n", r2); - out.printf("D': %.5f%n", dprime); - - System.exit(0); - } - - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return null; - } - - @Override - public Integer reduceInit() { - return null; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return null; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/LocusDepthProportionWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/LocusDepthProportionWalker.java deleted file mode 100755 index 22a2efc34..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/LocusDepthProportionWalker.java +++ /dev/null @@ -1,84 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; - -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.executive.TreeReducer; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.pileup.PileupElement; - -import java.io.PrintStream; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 6/14/11 - * Time: 10:14 AM - * To change this template use File | Settings | File Templates. 
- */ -public class LocusDepthProportionWalker extends LocusWalker implements TreeReducible { - - @Output - PrintStream out; - - private Map samOrder; - - public void initialize() { - samOrder = new HashMap(getToolkit().getSAMFileSamples().size()); - int idx = 0; - out.printf("pos"); - for ( Sample s : getToolkit().getSAMFileSamples() ) { - out.printf("\t"); - out.printf(s.getId()); - samOrder.put(s.getId(),idx++); - } - out.printf("\t%s%n","total"); - } - - public Boolean reduceInit() { return null; } - - public double[] map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if ( ref == null || ! context.hasBasePileup() || ! context.hasReads() ) { return null; } - - out.print(ref.getLocus()); - double[] props = new double[1+samOrder.size()]; - - // one pass this - int nReads = context.size(); - - for ( PileupElement e : context.getBasePileup() ) { - props[samOrder.get(e.getRead().getReadGroup().getSample())] += 1; - } - - for ( int idx = 0; idx < props.length -1 ; idx ++ ) { - props[idx] /= nReads; - } - - props[props.length-1] = nReads; - - return props; - } - - public Boolean reduce(double[] map, Boolean pr) { - if ( map == null ) { return null; } - - StringBuffer buf = new StringBuffer(); - for ( double d : map ) { - buf.append("\t"); - buf.append(String.format("%.4f",d)); - } - - out.printf("%s%n",buf.toString()); - - return null; - } - - public Boolean treeReduce(Boolean a, Boolean b) { return null; } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFAArgumentCollection.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFAArgumentCollection.java deleted file mode 100755 index 8256c3cdb..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFAArgumentCollection.java +++ /dev/null @@ -1,54 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; - -import org.broadinstitute.sting.commandline.Argument; 
-import org.broadinstitute.sting.commandline.ArgumentCollection; - -import java.io.File; -import java.util.List; - -/** - * Argument collection for the read feature associator and related walkers - */ -public class RFAArgumentCollection { - - @Argument(doc="ReadFeatures you want to test. None specified = all will be tested.",required=false,shortName="f",fullName="Feature") - public List inputFeatures = null; - - @Argument(doc="Size of window on which to perform tests",required=false,fullName="windowSize") - public int windowSize = 50; - - @Argument(doc="Size of the jump between tested windows",required=false,fullName="windowJump") - public int windowJump = 10; - - @Argument(doc="File containing a list of case samples",required=false,shortName="case",fullName="case") - public File caseFile = null; - - @Argument(doc="File containing a list of control samples",required=false,shortName="control",fullName="control") - public File controlFile = null; - - @Argument(doc="Fixed significance level, as a Z-score",required=false,shortName="z",fullName="fixedZ") - public double fixedZ = 6.0; - - @Argument(doc="Insert size below which to flag a read as aberrant",required=false,shortName="LIS",fullName="LowInsertSize") - public int lowInsertSize = 50; - - @Argument(doc="Insert size above which to flag a read as aberrant",required=false,shortName="HIS",fullName="HighInsertSize") - public int highInsertSize = 450; - - @Argument(doc="Significance level for determining whether a sample contains a significant proportion of affected reads",required=false,shortName="sz",fullName="perSampleZ") - public double sampleZThresh = 3.0; - - @Argument(doc="Lower bound for significant proportion of affected reads",shortName="se",fullName="sampleEpsilon",required=false) - public double EPSILON = 0.05; - - @Argument(doc="Number of clipped bases for a read to be considered \"split\"",required=false,shortName="cb",fullName="clippedBases") - public short clippedBases = 4; - - /* todo -- these for a 
possible "split read" binarification of clipped bases -- if there's a fast way to traverse clipped bases - @Argument(doc="Minimum base quality for a clipped base to be indicative of a split read",required=false,shortName="CLBQ",fullName="CLBQ") - public short clbq = 10; - - @Argument(doc="Minimum base quality sum for clipped bases (as a string) to be indicative of a split read",required=false,shortName="CLBS",fullName="CLBS") - public short clbs=50; - */ -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFAWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFAWalker.java deleted file mode 100755 index e8262a33f..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFAWalker.java +++ /dev/null @@ -1,335 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.FileNotFoundException; 
-import java.io.PrintStream; -import java.util.*; - -/** - * Read feature association walker -- associates read features between dichotomized, or multi-group cohorts - * todo -- need a heuristic to stop doing tests where there is certainly no signal - * todo -- for most features there's a nuisance variable which is the proportion of *paired* reads, perhaps a pair-only setting for read features - */ - -@ReadFilters({MaxInsertSizeFilter.class,MappingQualityReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class,NotPrimaryAlignmentReadFilter.class,UnmappedReadFilter.class}) -public class RFAWalker extends ReadWalker { - // todo -- this needs to be an argument collection that can get passed around to initialize read features etc - @ArgumentCollection - private RFAArgumentCollection collection = new RFAArgumentCollection(); - - @Output - public PrintStream out; - - Map caseStatus; - - protected List aggregators; // no re-instantiation, use a list to ensure ordering - - protected Iterator locusIterator; - protected GenomeLoc iteratorLoc; - protected GenomeLoc loc; - protected String sample; - private List EMPTY_LIST = new ArrayList(0); - - public void initialize() { - if ( collection.windowSize % collection.windowJump != 0 ) { - throw new UserException("Window size is not divisible by window jump."); - } - - if ( collection.caseFile == null || collection.controlFile == null ) { - throw new UserException("You must provide both a case file (-case) and a control file (-control) each listing those samples belonging to the cohort"); - } - - caseStatus = new HashMap(getToolkit().getSAMFileSamples().size()); - try { - for ( String sample : new XReadLines(collection.caseFile) ) { - caseStatus.put(sample,true); - } - for ( String sample : new XReadLines(collection.controlFile)) { - caseStatus.put(sample,false); - } - - for ( Sample sample : getToolkit().getSAMFileSamples() ) { - if ( ! 
caseStatus.containsKey(sample.getId())) { - throw new UserException("No case/control status for sample "+sample.getId()); - } - } - - } catch ( FileNotFoundException e ) { - throw new UserException("Unable to open a case/control file",e); - } - - Set> aggregatorSet = getFeatureAggregators(collection.inputFeatures); - Set rfHolder1 = new HashSet(aggregatorSet.size()); - try { - for ( Class featureClass : aggregatorSet ) { - ReadFeatureAggregator readFeature = featureClass.getConstructor(RFAArgumentCollection.class).newInstance(collection); - rfHolder1.add(readFeature); - } - } catch ( Exception e ) { - throw new StingException("A read feature instantiation error occurred during initialization",e); - } - - ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()]; - int idx = 0; - for ( ReadFeatureAggregator f : rfHolder1 ) { - rfHolder2[idx++] = f; - } - Arrays.sort(rfHolder2, new Comparator() { - @Override - public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) { - return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName()); - } - }); - aggregators = Arrays.asList(rfHolder2); - - writeHeader(); - - locusIterator = getToolkit().getIntervals().iterator(); - iteratorLoc = locusIterator.hasNext() ? 
locusIterator.next() : null; - - } - - public RFWindow reduceInit() { - Set samples = new HashSet(getToolkit().getSamples().size()); - for ( Sample s : getToolkit().getSamples() ) { - samples.add(s.getId()); - } - return new RFWindow(aggregators,collection,caseStatus,getToolkit().getGenomeLocParser()); - } - - public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - if ( ref == null ) { return null; } // unmapped reads have null ref contexts - //loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart()); - GenomeLoc newLoc = ref.getLocus().getStartLocation(); // can be problematic if read aligns prior to start of contig -- should never happen - if ( newLoc.isPast(iteratorLoc.getStartLocation()) ) { - loc = newLoc; - } else { - loc = iteratorLoc.getStartLocation(); - } - if ( read == null ) { return null; } - sample = read.getReadGroup().getSample(); - return read; - } - - public RFWindow reduce(SAMRecord read, RFWindow prevReduce) { - if ( iteratorLoc != null && iteratorLoc.isBefore(loc) ) {// test if read is past end of the user interval - //logger.info(String.format("iteratorLoc: %s loc: %s",iteratorLoc.toString(),loc.toString())); - onIntervalDone(prevReduce); - iteratorLoc = locusIterator.hasNext() ? 
locusIterator.next() : null; - if ( loc.startsBefore(iteratorLoc) ) { - loc = iteratorLoc.getStartLocation(); - } - reduce(read,prevReduce); - } else if ( read != null ) { - // todo -- what happens if first read of an interval is not before or at the start of the interval?\ - List>>> completed = prevReduce.inc(read, loc, sample,iteratorLoc); - // todo -- run tests here; for now just log that a window/multiple windows are complete - if ( completed.size() > 0 ) { - // System.out.printf("At %s we have seen %d completed windows%n",loc,completed.size()) - // bed format - int locShift = 0; - for ( Pair>> samWindow : completed ) { - GenomeLoc window = samWindow.first; - runWindowTests(samWindow.second, window); - locShift += collection.windowJump; - } - } - } - return prevReduce; - } - - - public RFWindow onIntervalDone(RFWindow rWindow) { - //logger.info("In onIntervalDone at genome loc "+iteratorLoc.toString()+" with read loc "+loc.toString()); - List>>> completed = rWindow.flush(iteratorLoc); - int locShift = 0; - for ( Pair>> samWindow : completed ) { - GenomeLoc window = samWindow.first; - runWindowTests(samWindow.second, window); - locShift += collection.windowJump; - } - - return rWindow; - } - - public Set> getFeatureAggregators(List requestedFeatures) { - HashSet> newFeatureSet = new HashSet>(); - List> availableFeatures = new PluginManager(ReadFeatureAggregator.class).getPlugins(); - - if ( collection.inputFeatures == null ) { - newFeatureSet.addAll(availableFeatures); - return newFeatureSet; - } - - - Map> classNameToClass = new HashMap>(collection.inputFeatures.size()); - for ( Class clazz : availableFeatures ) { - classNameToClass.put(clazz.getSimpleName(),clazz); - } - - for ( String s : requestedFeatures) { - if ( classNameToClass.containsKey(s) ) { - newFeatureSet.add(classNameToClass.get(s)); - } else { - throw new UserException("The name "+s+" does not correspond to an available read feature class."); - } - } - - return newFeatureSet; - } - - public void 
runWindowTests(Map> window, GenomeLoc loc) { - // two main tests: fixed-significance shift, and confidence-interval sum - //System.out.printf("Running tests...%n"); - // todo -- really the aggregators should be iterated over directly (rather than indirectly through the index) - out.printf("%s\t%d\t%d",loc.getContig(),loc.getStart(),loc.getStop()); - for ( int agIdx = 0; agIdx < aggregators.size(); agIdx ++ ) { - double fixedDelta = fixedSignificance(window.get("case").get(agIdx),window.get("control").get(agIdx)); - //double weightedDelta = confidenceIntervalSum(window,agIdx); - //out.printf("\t%.2e\t%.2e",Double.isNaN(fixedDelta) ? 0.0 : fixedDelta,weightedDelta); - Pair,List> caseControlAffected = getAffectedSamples(window,agIdx,fixedDelta); - List cases = caseControlAffected.getFirst(); - List controls = caseControlAffected.getSecond(); - out.printf("\t%.2e\t%d:%d\t%.2e\t%s;%s",fixedDelta,cases.size(),controls.size(), MathUtils.binomialProbability(cases.size(),cases.size()+controls.size(),0.5),Utils.join(",",cases),Utils.join(",",controls)); - - } - out.printf("%n"); - } - - public Pair,List> getAffectedSamples(Map> aggregators, int idx, double fixedDelta) { - if ( fixedDelta == 0.0 ) { return new Pair,List>(EMPTY_LIST,EMPTY_LIST); } // todo -- too hacky - - Pair,List> ccSampleList = new Pair,List>(new ArrayList(), new ArrayList()); - for ( Map.Entry> entry : aggregators.entrySet() ) { - if ( entry.getKey().equals("case") || entry.getKey().equals("control")) { continue; } - ReadFeatureAggregator aggregator = entry.getValue().get(idx); - // is this sending a truly significant signal - double zs = (aggregator.getMean() - collection.EPSILON) * Math.sqrt(aggregator.getnReads())/Math.sqrt(aggregator.getUnbiasedVar()); - if ( zs > collection.sampleZThresh ) { - if ( caseStatus.get(entry.getKey()) ) { - ccSampleList.first.add(entry.getKey()); - } else { - ccSampleList.second.add(entry.getKey()); - } - } - } - - return ccSampleList; - } - - public double 
fixedSignificance(ReadFeatureAggregator caseAg, ReadFeatureAggregator controlAg) { - if ( caseAg.getnReads() == 0 || controlAg.getnReads() == 0 ) { - return 0.0; - } - double stat_num = caseAg.getMean() - controlAg.getMean(); - double stat_denom = Math.sqrt(caseAg.getUnbiasedVar()/caseAg.getnReads() + controlAg.getUnbiasedVar()/controlAg.getnReads()); - double stat = stat_num/stat_denom; - //System.out.printf("Mean_dif: %.2e Var: %.2e Stat: %.2f Z: %.2f SS-ZZ: %.2f%n",stat_num,stat_denom,stat, collection.fixedZ, stat*stat-collection.fixedZ*collection.fixedZ); - if ( stat*stat < collection.fixedZ*collection.fixedZ ) { - return 0.0; - } else { - //System.out.printf("Calculating delta: %.2f%n",(stat < 0) ? stat_denom*(-1*collection.fixedZ-stat) : stat_denom*(stat-collection.fixedZ)); - //return (stat > 0) ? stat_denom*(stat-collection.fixedZ) : stat_denom*(stat+collection.fixedZ); - return stat_num; - } - } - - public double confidenceIntervalSum(Map> window, int offset) { - // this comment serves as an explicit normality assumption (e.g. that the DF will be large) - double caseWeightMean = 0.0; - double caseWeightVar = 0.0; - double caseSumWeight = 0.0; - double controlWeightMean = 0.0; - double controlWeightVar = 0.0; - double controlSumWeight = 0.0; - for ( Map.Entry> sampleEntry : window.entrySet() ) { - if ( ! sampleEntry.getKey().equals("case") && ! sampleEntry.getKey().equals("control") ) { - // first check if the sample is shifted from zero (CI does not include zero) - // todo -- fixme. 
This will always be true for insert sizes, clipped reads, mapping quality...should have an avg value - - ReadFeatureAggregator aggregator = sampleEntry.getValue().get(offset); - if ( aggregator.getnReads() == 0 ) { - continue; - } - - boolean shifted; - int shiftThresh = 0;/* - // todo -- this is fucking awful - if ( aggregator instanceof InsertSize ) { - shiftThresh = 150; - } - if ( aggregator instanceof ClippedBases ) { - shiftThresh = 6; - }*/ - if ( aggregator.getMean() < shiftThresh ) { - // use fixedZ/4 to be a little bit less strenuous -- todo -- make an input? - shifted = aggregator.getMean() + collection.fixedZ/4*Math.sqrt(aggregator.getUnbiasedVar())/aggregator.getnReads() < shiftThresh; - } else { - shifted = aggregator.getMean() - collection.fixedZ/4*Math.sqrt(aggregator.getUnbiasedVar())/aggregator.getnReads() > shiftThresh;; - } - if ( shifted ) { - double twoS2 = 2*aggregator.getUnbiasedVar()*aggregator.getUnbiasedVar(); - if ( caseStatus.get(sampleEntry.getKey())) { - caseWeightMean += (aggregator.getnReads()-1.0)*aggregator.getMean()/(twoS2); - caseWeightVar += aggregator.getUnbiasedVar()*Math.pow((aggregator.getnReads()-1.0)/(twoS2),2); - caseSumWeight += (aggregator.getnReads()-1.0)/(twoS2); - } else { - controlWeightMean += (aggregator.getnReads()-1.0)*aggregator.getMean()/(twoS2); - controlWeightVar += aggregator.getUnbiasedVar()*Math.pow((aggregator.getnReads()-1.0)/(twoS2),2); - controlSumWeight += (aggregator.getnReads()-1.0)/(twoS2); - } - } - } - } - - double caseGaussianMean = caseWeightMean/caseSumWeight; - double controlGaussianMean = controlWeightMean/controlSumWeight; - double caseGaussianVar = caseWeightVar/(caseSumWeight*caseSumWeight); - double controlGaussianVar = controlWeightVar/(controlSumWeight*controlSumWeight); - // todo -- is the z-factor an appropriate statistic? 
- //return 1.0 - 3*(caseGaussianVar+controlGaussianVar)/Math.abs(caseGaussianMean-controlGaussianMean); - if ( caseGaussianMean > controlGaussianMean ) { - // want to examine the case lower fixedZ*stdev vs the control upper fixedZ*stev - return (caseGaussianMean-collection.fixedZ/4*Math.sqrt(caseGaussianVar)) - (controlGaussianMean + collection.fixedZ/4*Math.sqrt(controlGaussianVar)); - } else { - // want to examine the case upper fixedZ*stev vs the control lower fixedZ*stev - return (controlGaussianMean-collection.fixedZ/4*Math.sqrt(controlGaussianVar)) - ( caseGaussianMean + collection.fixedZ/4*Math.sqrt(caseGaussianVar)); - } - } - - public void writeHeader() { - // "%.2e\t%d:%d\t%s,%s\t%.2e" - StringBuffer buf = new StringBuffer(); - buf.append("description=chr,start,stop"); - for ( ReadFeatureAggregator f : aggregators ) { - buf.append(","); - buf.append(f.getClass().getSimpleName()); - buf.append("-d,"); - buf.append(f.getClass().getSimpleName()); - buf.append("-r,"); - buf.append(f.getClass().getSimpleName()); - buf.append("-s,"); - buf.append(f.getClass().getSimpleName()); - buf.append("-p"); - } - out.printf("track type=bedTable %s%n",buf); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFCombineWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFCombineWalker.java deleted file mode 100755 index 37dd900ec..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFCombineWalker.java +++ /dev/null @@ -1,105 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; - -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import 
org.broadinstitute.sting.gatk.refdata.features.table.TableCodec; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/23/11 - * Time: 5:22 PM - * To change this template use File | Settings | File Templates. - */ -public class RFCombineWalker extends RodWalker { - - private static final String FIRST_COL = "chrm:start-stop"; - - @Output - PrintStream out; - - private List order; - - private GenomeLoc prevLoc; - - public void initialize() { - order = new ArrayList(getToolkit().getRodDataSources().size()); - StringBuffer header = new StringBuffer(); - header.append(FIRST_COL); - for ( ReferenceOrderedDataSource rSource : getToolkit().getRodDataSources() ) { - if ( rSource.getRecordType().isAssignableFrom(TableFeature.class) ) { - //System.out.println(rSource.getHeader().toString()); - for ( String entry : (Collection) rSource.getHeader() ) { - if ( ! entry.startsWith("HEADER") ) { - header.append("\t"); - header.append(entry); - } - } - order.add(rSource.getName()); - } - } - - out.printf("%s%n",header); - - prevLoc = null; - } - - public Object reduceInit() { return null; } - - public Object map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null || ref == null ) { return null;} - - GenomeLoc loc = null; - boolean needPrint = false; - List eventBySample = new ArrayList(); - - for ( String rodName : order ) { - List namedMD = tracker.getReferenceMetaData(rodName,true); - TableFeature feature = null; - if ( namedMD.size() > 0 ) { - feature = namedMD.get(0) instanceof TableFeature ? 
(TableFeature) namedMD.get(0) : null; - } - - if ( feature == null ) { throw new StingException("This should be an instance of TableFeature, no?"); } - - loc = feature.getLocation(); - if ( prevLoc != null && loc.equals(prevLoc) ) { - break; - } - - for ( String s : feature.getAllValues().subList(1,feature.getAllValues().size()) ) { - boolean has = ! (s.charAt(0) == '0'); - eventBySample.add(s); - needPrint |= has; - } - } - - if ( needPrint && (loc != null)) { - out.printf("%s",loc.toString()); - for ( String s : eventBySample ) { - out.printf("\t%s", s); - } - out.printf("%n"); - } - - prevLoc = loc; - - return null; - - } - - public Object reduce(Object map, Object reduce) { return null; } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFDumperWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFDumperWalker.java deleted file mode 100755 index c146311cd..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFDumperWalker.java +++ /dev/null @@ -1,126 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.FileNotFoundException; 
-import java.io.PrintStream; -import java.util.*; - -/** - * A (currently very lame) utility to dump read feature information to a file for validity checking of read feature extractors. - * Soon to be extended for aggregator and window validation; as well as postspective signal analysis. - */ -public class RFDumperWalker extends ReadWalker { - @ArgumentCollection - private RFAArgumentCollection collection = new RFAArgumentCollection(); - - List aggregators; - GenomeLoc loc; - boolean paired; - String sample; - String name; - - String[] data; - - @Output - PrintStream out; - - public void initialize() { - - Set> aggregatorSet = getFeatureAggregators(collection.inputFeatures); - Set rfHolder1 = new HashSet(aggregatorSet.size()); - try { - for ( Class featureClass : aggregatorSet ) { - ReadFeatureAggregator readFeature = featureClass.getConstructor(collection.getClass()).newInstance(collection); - rfHolder1.add(readFeature); - } - } catch ( Exception e ) { - throw new StingException("A read feature instantiation error occurred during initialization",e); - } - - ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()]; - int idx = 0; - for ( ReadFeatureAggregator f : rfHolder1 ) { - rfHolder2[idx++] = f; - } - Arrays.sort(rfHolder2, new Comparator() { - @Override - public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) { - return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName()); - } - }); - aggregators = Arrays.asList(rfHolder2); - for ( ReadFeatureAggregator ag : aggregators ) { - logger.info(ag.getClass().getSimpleName()); - } - - data = new String[aggregators.size()]; - } - - public Set> getFeatureAggregators(List requestedFeatures) { - HashSet> newFeatureSet = new HashSet>(); - List> availableFeatures = new PluginManager(ReadFeatureAggregator.class).getPlugins(); - - if ( collection.inputFeatures == null ) { - newFeatureSet.addAll(availableFeatures); - return newFeatureSet; - } - - - Map> classNameToClass = 
new HashMap>(collection.inputFeatures.size()); - for ( Class clazz : availableFeatures ) { - classNameToClass.put(clazz.getSimpleName(),clazz); - } - - for ( String s : requestedFeatures) { - if ( classNameToClass.containsKey(s) ) { - newFeatureSet.add(classNameToClass.get(s)); - } else { - throw new UserException("The name "+s+" does not correspond to an available read feature class."); - } - } - - return newFeatureSet; - } - - public ReadFeatureWindow reduceInit() { return null; } - - public String[] map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - // TODO: THIS WILL BREAK IF FEATURE REQUIRES PAIRED READ - if ( ref == null ) { return null; } // unmapped reads have null ref contexts - loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart()); - sample = read.getReadGroup().getSample(); - name = read.getReadName(); - paired = read.getReadPairedFlag() && ! read.getMateUnmappedFlag(); - int idx = 0; - for ( ReadFeatureAggregator aggregator : aggregators) { - data[idx++] = String.format("%s: %s",aggregator.getClass().getSimpleName(),aggregator.parseStr(read)); - } - return data; - } - - public ReadFeatureWindow reduce(String[] map, ReadFeatureWindow prevReduce) { - if ( map == null ) { return null; } - StringBuffer fStrBuilder = new StringBuffer(); - for ( String f : map ) { - fStrBuilder.append("\t"); - fStrBuilder.append(f); - } - - out.printf("%s\t%s\t%s%s%n",loc,name,sample,fStrBuilder); - - return null; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFExtractorWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFExtractorWalker.java deleted file mode 100755 index dfc5acc3b..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFExtractorWalker.java +++ /dev/null @@ -1,243 +0,0 @@ -package 
org.broadinstitute.sting.oneoffprojects.walkers.newassociation; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ClippedBases; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.InsertSize; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/12/11 - * Time: 1:48 PM - * To change this template use File | Settings | File Templates. 
- */ -@ReadFilters({MaxInsertSizeFilter.class,MappingQualityReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class,NotPrimaryAlignmentReadFilter.class,UnmappedReadFilter.class}) -@By(DataSource.REFERENCE) -public class RFExtractorWalker extends ReadWalker { - - @ArgumentCollection - public RFAArgumentCollection rfaArgs = new RFAArgumentCollection(); - - @Argument(doc="Set the marker threshold to this.",shortName="mmt",fullName="markerModeThreshold",required=false) - public double markerModeThreshold = 0.05; - - @Argument(doc="Turn on marker mode (1 if sample is significantly greater than the threshold, 0 otherwise)",shortName="mm",fullName="markerMode",required=false) - public boolean markerMode = false; - - @Argument(doc="Turn on raw count mode: output will be raw aberrant read counts",shortName="c",fullName="count",required=false) - public boolean countMode = false; - - - @Output - public PrintStream out; - - protected Iterator locusIterator; - protected GenomeLoc iteratorLoc; - protected GenomeLoc loc; - protected String sample; - - public void initialize() { - if ( rfaArgs.windowSize % rfaArgs.windowJump != 0 ) { - throw new UserException("Window size is not divisible by window jump."); - } - - if ( markerMode && markerModeThreshold < 0.0 ) { - throw new UserException("Cannot have a negative threshold when using marker mode"); - } - - if ( countMode && markerMode ) { - throw new UserException("Cannot be both in count mode and marker mode"); - } - - locusIterator = getToolkit().getIntervals().iterator(); - iteratorLoc = locusIterator.next(); - } - - public RFWindow reduceInit() { - Map allCase = new HashMap(getToolkit().getSamples().size()); - for ( Sample s : getToolkit().getSAMFileSamples() ) { - allCase.put(s.getId(),true); - if ( s.getId() == null || s.getId().equals("null") ) { - throw new StingException("Sample IDs must not be null... 
" + s.toString() + " " + Boolean.toString(s.hasSAMFileEntry())); - } - } - - Set> aggregatorSet = getFeatureAggregators(rfaArgs.inputFeatures); - Set rfHolder1 = new HashSet(aggregatorSet.size()); - try { - for ( Class featureClass : aggregatorSet ) { - ReadFeatureAggregator readFeature = featureClass.getConstructor(RFAArgumentCollection.class).newInstance(rfaArgs); - rfHolder1.add(readFeature); - } - } catch ( Exception e ) { - throw new StingException("A read feature instantiation error occurred during initialization",e); - } - - ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()]; - int idx = 0; - for ( ReadFeatureAggregator f : rfHolder1 ) { - rfHolder2[idx++] = f; - } - Arrays.sort(rfHolder2, new Comparator() { - @Override - public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) { - return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName()); - } - }); - - List aggregators = Arrays.asList(rfHolder2); - - out.printf("HEADERchrm:start-stop"); - for ( String s : allCase.keySet() ) { - for ( ReadFeatureAggregator rfa : aggregators ) { - out.printf("\t%s.%s",s,rfa.getClass().getSimpleName()); - } - } - out.printf("%n"); - - return new RFWindow(aggregators,rfaArgs,allCase,getToolkit().getGenomeLocParser()); - } - - public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - if ( ref == null ) { return null; } // unmapped reads have null ref contexts - //loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart()); - GenomeLoc newLoc = ref.getLocus().getStartLocation(); // can be problematic if read aligns prior to start of contig -- should never happen - if ( newLoc.isPast(iteratorLoc.getStartLocation()) ) { - loc = newLoc; - } else { - loc = iteratorLoc.getStartLocation(); - } - if ( read == null ) { return null; } - sample = read.getReadGroup().getSample(); - return read; - } - - public RFWindow reduce(SAMRecord read, 
RFWindow prevReduce) { - if ( iteratorLoc != null && iteratorLoc.isBefore(loc) ) {// test if read is past end of the user interval - //logger.info(String.format("iteratorLoc: %s loc: %s",iteratorLoc.toString(),loc.toString())); - onIntervalDone(prevReduce); - iteratorLoc = locusIterator.hasNext() ? locusIterator.next() : null; - if ( loc.startsBefore(iteratorLoc) ) { - loc = iteratorLoc.getStartLocation(); - } - reduce(read,prevReduce); - } else if ( read != null ) { - // todo -- what happens if first read of an interval is not before or at the start of the interval? - - List>>> completed = prevReduce.inc(read, loc, sample,iteratorLoc); - if ( completed.size() > 0 ) { - // System.out.printf("At %s we have seen %d completed windows%n",loc,completed.size()) - // bed format - for ( Pair>> samWindow : completed ) { - GenomeLoc window = samWindow.first; - /*if ( prevPrint == null ) { - prevPrint = window; - } else if ( window.startsBefore(prevPrint) ) { - throw new StingException(String.format("Attempting to print at %s after having printed a record at %s",window.toString(),prevPrint.toString())); - } else { - prevPrint = window; - }*/ - out.printf("%s",window.toString()); - for ( Map.Entry> samEntry : samWindow.second.entrySet() ) { - for ( ReadFeatureAggregator aggregator : samEntry.getValue() ) { - if ( ! markerMode && ! countMode ) { - out.printf("\t%.5e,%d",aggregator.getMean(),aggregator.getnReads()); - } else if ( markerMode ) { - out.printf("\t%d",hasEvent(aggregator,markerModeThreshold,rfaArgs.fixedZ) ? 
1 : 0); - } else if ( countMode ) { - out.printf("\t%d", MathUtils.fastRound(aggregator.getMean()*aggregator.getnReads())); - } - } - } - out.printf("%n"); - } - } - } else { - prevReduce.inc(null,loc,null,iteratorLoc); - } - return prevReduce; - } - - public RFWindow onIntervalDone(RFWindow rWindow) { - //logger.info("In onIntervalDone at genome loc "+iteratorLoc.toString()+" with read loc "+loc.toString()); - List>>> completed = rWindow.flush(iteratorLoc); - for ( Pair>> samWindow : completed ) { - GenomeLoc window = samWindow.first; - /*if ( prevPrint == null ) { - prevPrint = window; - } else if ( window.startsBefore(prevPrint) ) { - throw new StingException(String.format("Attempting to print at %s after having printed a record at %s",window.toString(),prevPrint.toString())); - } else { - prevPrint = window; - }*/ - out.printf("%s",window.toString()); - for ( Map.Entry> samEntry : samWindow.second.entrySet() ) { - for ( ReadFeatureAggregator aggregator : samEntry.getValue() ) { - if ( ! markerMode && ! countMode ) { - out.printf("\t%.5e,%d",aggregator.getMean(),aggregator.getnReads()); - } else if ( markerMode ) { - out.printf("\t%d",hasEvent(aggregator,markerModeThreshold,rfaArgs.fixedZ) ? 
1 : 0); - } else if ( countMode ) { - out.printf("\t%d", MathUtils.fastRound(aggregator.getMean()*aggregator.getnReads())); - } - } - } - out.printf("%n"); - } - - return rWindow; - } - - public Set> getFeatureAggregators(List requestedFeatures) { - HashSet> newFeatureSet = new HashSet>(); - List> availableFeatures = new PluginManager(ReadFeatureAggregator.class).getPlugins(); - - if ( rfaArgs.inputFeatures == null ) { - newFeatureSet.addAll(availableFeatures); - return newFeatureSet; - } - - - Map> classNameToClass = new HashMap>(rfaArgs.inputFeatures.size()); - for ( Class clazz : availableFeatures ) { - classNameToClass.put(clazz.getSimpleName(),clazz); - } - - for ( String s : requestedFeatures) { - if ( classNameToClass.containsKey(s) ) { - newFeatureSet.add(classNameToClass.get(s)); - } else { - throw new UserException("The name "+s+" does not correspond to an available read feature class."); - } - } - - return newFeatureSet; - } - - public static boolean hasEvent(ReadFeatureAggregator aggregator, double lowThresh, double sigLevel) { - return (aggregator.getMean() - lowThresh)*Math.sqrt(aggregator.getnReads())/aggregator.getVar() > sigLevel; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFWindow.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFWindow.java deleted file mode 100755 index ae3f40c77..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFWindow.java +++ /dev/null @@ -1,243 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; - -import com.google.java.contract.Requires; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import 
org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - */ -public class RFWindow { - - private RFAArgumentCollection argumentCollection; // the RF Argument Collection from the RF Walker - private List windowStarts; // holds the starting positions of the windows - private Map sampleCohortMap; // identifies case samples ("true") and control samples ("false") - List aggregators; // read feature aggregators to be used (maintained for fast cloning, and ordering) - // feature ---> sample ---> count - List>> aggregatorWindows; // holds the map between samples and features in the current windows - private GenomeLocParser parser; // the active parser, for creating GenomeLocs for empty windows - private GenomeLoc previousLoc; // the previous genome loc within the user interval given to the RFWindow - - /** - * Defines a new window which maps samples to read feature aggregators - * @return - a new element for aggregatorWindows - */ - private Map> newWindow() { - Map> win = new HashMap>(sampleCohortMap.size()); - for ( String s : sampleCohortMap.keySet() ) { - win.put(s,getAggregators()); - } - // todo -- generalize me - win.put("case",getAggregators()); - win.put("control",getAggregators()); - - return win; - } - - /** - * Generates a list of new aggregators to collect data - * @return list of ReadFeatureAggregators to be the value of a new window - */ - private List getAggregators() { - ArrayList newEmptyAgs = new ArrayList(aggregators.size()); - try { - for ( ReadFeatureAggregator ag : aggregators ) { - newEmptyAgs.add(ag.getClass().getConstructor(RFAArgumentCollection.class).newInstance(argumentCollection)); - } - } catch (Exception e) { - throw new StingException("Error instantiating read feature aggregator",e); - } - - return newEmptyAgs; - } - - /** - * A constructor for the RFWindow object - * @param collection - the argument 
collection, from the walker - * @param cohortMap - the map between sample IDs and whether or not it is a case sample, from the walker - * @param parser - the Genome Loc Parser, from the walker - */ - public RFWindow(List aggregators, RFAArgumentCollection collection, Map cohortMap, GenomeLocParser parser) { - this.argumentCollection = collection; - this.sampleCohortMap = cohortMap; - this.parser = parser; - this.aggregators = aggregators; - } - - /** - * Instantiate the tiled windows of sample -> List maps, filling in empty windows up to the one which contains the - * provided genomeloc if necessary. - * @param loc - the location to fill up to, usually the starting position of a read or an interval - * @param currentUserInterval - the current user-provided interval being processed by the walker - */ - @Requires({"! currentUserInterval.isBefore(loc)"}) - public void instantiate(GenomeLoc loc, GenomeLoc currentUserInterval) { - aggregatorWindows = new ArrayList>>(argumentCollection.windowSize/argumentCollection.windowJump); - windowStarts = new ArrayList(argumentCollection.windowSize/argumentCollection.windowJump); - //System.out.printf("Calling fill at %s\t%s%n",loc,currentUserInterval); - this.fill(loc, currentUserInterval); - } - - /** - * Fills the tiled window lists with empty windows up to the one which includes the locus - * todo -- this can take a lot of memory for large intervals instantiated far into them, e.g. - * |---------------------------------------------------------------------| interval - * . <=== locus - * todo -- perhaps a reduced representation for empty windows that were not created but already have expired? - * @param loc - the location to fill up to, usually the starting position of a read or an interval - * @param currentUserInterval - the current user-provided interval being processed by the walker - */ - @Requires({"! 
currentUserInterval.isBefore(loc)"}) - public void fill(GenomeLoc loc, GenomeLoc currentUserInterval) { - // case -1: window could be empty - if ( windowStarts == null ) { - instantiate(loc,currentUserInterval); - } - - // case 0: if the windows are empty add in the beginning of the interval - if ( windowStarts.size() == 0 ) { - windowStarts.add(currentUserInterval.getStartLocation()); - aggregatorWindows.add(newWindow()); - } - - // case 1: loc is before or within windowJump bases of the current user interval; we need only instantiate the first window - if ( loc.isBefore(currentUserInterval) || loc.distance(currentUserInterval) <= argumentCollection.windowJump ) { - // do nothing at all - } else { - // case 2: loc is somewhere in the middle or at the end of current user interval, need to fill in windows up until then - GenomeLoc nextLoc = windowStarts.get(windowStarts.size()-1); - while ( loc.distance(nextLoc) > argumentCollection.windowJump ) { - //System.out.printf("Filling with nextloc %s%n",nextLoc); - nextLoc = shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump); - windowStarts.add(nextLoc); - aggregatorWindows.add(newWindow()); - } - } - - } - - /** - * Shifts a location from chrZ:X to chrZ:X+jump - * @param loc - location to shift - * @param jump - amount to shift by - * @return loc with start position shifted by jump - */ - @Requires("loc.getStart()+jump < parser.getContigInfo(loc.getContig()).getSequenceLength()") // e.g. 
that the new loc is not off the end of the contig - private GenomeLoc shiftLocByJump(GenomeLoc loc, int jump) { - return parser.createGenomeLoc(loc.getContig(),loc.getStart()+jump); - } - - /** - * Fills in missing windows between the previously-seen one and the ending interval, then expires all windows, returning - * them as complete, resetting previousLocation to null so that the next read re-instantiates internal window data - * @param userIntervalEnding - the user interval which is ending - * @return those currently active windows, plus empty interpolating windows between the last active window and the end of the interval - */ - public List>>> flush(GenomeLoc userIntervalEnding) { - // jump in locations -- flush the windows - List>>> complete = new ArrayList>>>(aggregators.size()); - // fill in any uncovered windows - GenomeLoc iStop = userIntervalEnding.getStopLocation(); - //System.out.printf("Calling fill from within flush at %s%n",userIntervalEnding); - fill(iStop,userIntervalEnding); - // now expire all windows, terminating them either at the proper window size, or at the endpoint of the interval if it comes sooner - while ( windowStarts.size() > 0 ) { - Map> cMap = aggregatorWindows.remove(0); - GenomeLoc wsLoc = windowStarts.remove(0); - - GenomeLoc cLoc; - if ( wsLoc.distance(iStop) > argumentCollection.windowSize ) { - cLoc = parser.createGenomeLoc(wsLoc.getContig(),wsLoc.getStart(),wsLoc.getStart()+argumentCollection.windowSize); - } else { - cLoc = wsLoc.endpointSpan(iStop); - } - - complete.add(new Pair>>(cLoc,cMap)); - } - - previousLoc = null; // will re-instantiate data upon next loc - - return complete; - } - - /** - * Workhorse method: - * - determines if new windows need be made - * - determines if old windows need be tested & trashed - * - determines which windows need to be updated & updates them - * - * @param loc - starting alignment position of the read - * @param sample - the sample ID from whom the read was sequenced - * @return - those 
feature window(s) that need be tested (and then baleeted) - */ - public List>>> inc(SAMRecord record, GenomeLoc loc, String sample, GenomeLoc userInterval) { - List>>> complete = new ArrayList>>>(aggregators.size()); - if ( previousLoc == null ) { - // first time, gotta instantiate stuff - instantiate(loc,userInterval); - windowInc(sample,record,aggregatorWindows.get(0)); - } else if ( loc.distance(previousLoc) == 0 ) { - // best and most common case: just update the living windows - for ( Map> win : aggregatorWindows ) { - windowInc(sample,record,win); - } - } else { - // another standard case: we've gone to some further base in the same interval - while ( loc.distance(windowStarts.get(windowStarts.size()-1)) > argumentCollection.windowJump ) { - // careful, don't use the location itself, but add in windows every winJump bases until this condition is not met - //System.out.printf("Adding within inc at %s\t%s%n",loc,userInterval); - windowStarts.add(shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump)); - aggregatorWindows.add(newWindow()); - } - - while ( windowStarts.size() > 0 && loc.distance(windowStarts.get(0)) > argumentCollection.windowSize ) { - Map> cMap = aggregatorWindows.remove(0); - GenomeLoc wsLoc = windowStarts.remove(0); - GenomeLoc iStop = userInterval.getStopLocation(); - GenomeLoc cLoc; - if ( wsLoc.distance(iStop) > argumentCollection.windowSize ) { - cLoc = parser.createGenomeLoc(wsLoc.getContig(),wsLoc.getStart(),wsLoc.getStart()+argumentCollection.windowSize); - } else { - cLoc = wsLoc.endpointSpan(iStop); - } - complete.add(new Pair>>(cLoc,cMap)); - } - - for ( Map> win : aggregatorWindows ) { - windowInc(sample,record,win); - } - - } - - previousLoc = loc; - - return complete; - } - - /** - * Incorporate new features into the window - * @param sample - id of sample from which the features come - * @param record - the read - * @param window - the particular window to be updated - */ - private void 
windowInc(String sample, SAMRecord record, Map> window) { - if ( sample == null || record == null ) { return; } - - for (ReadFeatureAggregator aggregator : window.get(sample) ) { - aggregator.aggregate(record); - } - - for ( ReadFeatureAggregator aggregator : window.get( sampleCohortMap.get(sample) ? "case" : "control") ) { - aggregator.aggregate(record); - } - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/ReadFeatureWindow.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/ReadFeatureWindow.java deleted file mode 100755 index b357dec9b..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/ReadFeatureWindow.java +++ /dev/null @@ -1,152 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.util.*; - -/** - * Workhorse class of read feature association: maintains the active windows, culls the inactive ones and passes - * them back for testing. Windows consist of aggregators, and list indeces link windows to their starting loc which - * determines their active/inactive status. 
- */ -public class ReadFeatureWindow { - RFAArgumentCollection args; - private List winStarts; - private GenomeLoc previousLoc; - private int winSize; - private int winJump; - private GenomeLocParser parser; - - private Map sampleCohortMap; - List aggregators; - // feature ---> sample ---> count - List>> aggregatorWindows; - - public ReadFeatureWindow(List aggregators, Map cohortMap, RFAArgumentCollection args, GenomeLocParser parser) { - winSize = args.windowSize; - winJump = args.windowJump; - sampleCohortMap = cohortMap; - this.args = args; - this.aggregators =aggregators; - this.parser = parser; - } - - public void instantiate(GenomeLoc loc) { - aggregatorWindows = new ArrayList>>(winSize/winJump); - winStarts = new ArrayList(winSize/winJump); - winStarts.add(loc); - aggregatorWindows.add(newWindow()); - } - - /** - * Workhorse method: - * - determines if new windows need be made - * - determines if old windows need be tested & trashed - * - determines which windows need to be updated & updates them - * - * @param loc - starting alignment position of the read - * @param sample - the sample ID from whom the read was sequenced - * @return - those feature window(s) that need be tested (and then baleeted) - */ - public List>>> inc(SAMRecord record, GenomeLoc loc, String sample) { - List>>> complete = new ArrayList>>>(aggregators.size()); - if ( previousLoc == null ) { - // first time, gotta instantiate stuff - instantiate(loc); - windowInc(sample,record,aggregatorWindows.get(0)); - } else if ( loc.distance(previousLoc) == 0 ) { - // best and most common case: just update the living windows - for ( Map> win : aggregatorWindows ) { - windowInc(sample,record,win); - } - } else { - // another standard case: we've gone to some further base in the same interval - while ( loc.distance(winStarts.get(winStarts.size()-1)) > winJump ) { - // careful, don't use the location itself, but add in windows every winJump bases until this condition is not met - 
winStarts.add(shiftLocByJump(winStarts.get(winStarts.size()-1),winJump)); - aggregatorWindows.add(newWindow()); - } - - while ( winStarts.size() > 0 && loc.distance(winStarts.get(0)) > winSize ) { - Map> cMap = aggregatorWindows.remove(0); - GenomeLoc cLoc = winStarts.remove(0).endpointSpan(previousLoc); - complete.add(new Pair>>(cLoc,cMap)); - } - - for ( Map> win : aggregatorWindows ) { - windowInc(sample,record,win); - } - - } - - previousLoc = loc; - - return complete; - } - - public List>>> flush() { - // jump in locations -- flush the windows - List>>> complete = new ArrayList>>>(aggregators.size()); - while ( winStarts.size() > 0 ) { - Map> cMap = aggregatorWindows.remove(0); - GenomeLoc cLoc = winStarts.remove(0).endpointSpan(previousLoc); - complete.add(new Pair>>(cLoc,cMap)); - } - - previousLoc = null; // will re-instantiate data upon next loc - - return complete; - } - - private Map> newWindow() { - Map> win = new HashMap>(sampleCohortMap.size()); - for ( String s : sampleCohortMap.keySet() ) { - win.put(s,getAggregators()); - } - // todo -- generalize me - win.put("case",getAggregators()); - win.put("control",getAggregators()); - - return win; - } - - private List getAggregators() { - ArrayList newEmptyAgs = new ArrayList(aggregators.size()); - try { - for ( ReadFeatureAggregator ag : aggregators ) { - newEmptyAgs.add(ag.getClass().getConstructor(RFAArgumentCollection.class).newInstance(args)); - } - } catch (Exception e) { - throw new StingException("Error instantiating read feature aggregator",e); - } - - return newEmptyAgs; - } - - /** - * Incorporate new features into the window - * @param sample - id of sample from which the features come - * @param record - the read - * @param window - the particular window to be updated - */ - private void windowInc(String sample, SAMRecord record, Map> window) { - if ( sample == null || record == null ) { return; } - - for (ReadFeatureAggregator aggregator : window.get(sample) ) { - 
aggregator.aggregate(record); - } - - for ( ReadFeatureAggregator aggregator : window.get( sampleCohortMap.get(sample) ? "case" : "control") ) { - aggregator.aggregate(record); - } - } - - private GenomeLoc shiftLocByJump(GenomeLoc loc, int jump) { - return parser.createGenomeLoc(loc.getContig(),loc.getStart()+jump); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/AberrantInsertSize.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/AberrantInsertSize.java deleted file mode 100755 index 8172ab5bb..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/AberrantInsertSize.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/4/11 - * Time: 1:09 PM - * To change this template use File | Settings | File Templates. 
- */ -public class AberrantInsertSize extends BinaryFeatureAggregator { - - private int min; - private int max; - - public AberrantInsertSize(RFAArgumentCollection col) { - super(col); - min = col.lowInsertSize; - max = col.highInsertSize; - } - - public Boolean extractFeature(SAMRecord rec) { - return Math.abs(rec.getInferredInsertSize()) > max || Math.abs(rec.getInferredInsertSize()) < min; - } - - public boolean featureDefined(SAMRecord rec) { - return rec.getReadPairedFlag() && rec.getProperPairFlag(); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryClippedBases.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryClippedBases.java deleted file mode 100755 index 7a52013f3..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryClippedBases.java +++ /dev/null @@ -1,48 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -/** - * Created by IntelliJ IDEA. - * User: Ghost - * Date: 5/19/11 - * Time: 12:08 AM - * To change this template use File | Settings | File Templates. 
- */ -public class BinaryClippedBases extends BinaryFeatureAggregator { - - private short baseLim; - private final byte baseQualLim = 20; - - public Boolean extractFeature(SAMRecord read) { - int firstClippedToAliStart = read.getUnclippedStart()-read.getAlignmentStart(); - int lastUnclippedToReadEnd = read.getUnclippedEnd()-read.getAlignmentEnd(); - - byte[] quals = read.getBaseQualities(); - int nClipped = 0; - for ( int offset = 0; offset < firstClippedToAliStart; offset++ ) { - if ( quals[offset] >= baseQualLim ) { - nClipped++; - } - } - - for ( int offset = quals.length - lastUnclippedToReadEnd; offset < quals.length ; offset++ ) { - if ( quals[offset] >= baseQualLim ) { - nClipped ++; - } - } - - return nClipped >= baseLim; - } - - public boolean featureDefined(SAMRecord rec) { return ! rec.getReadPairedFlag() || Math.abs(rec.getInferredInsertSize()) > 100; } // unpaired or no adaptor sequence - - public BinaryClippedBases(RFAArgumentCollection col) { - super(col); - baseLim = col.clippedBases; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryFeatureAggregator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryFeatureAggregator.java deleted file mode 100755 index 3c515db1d..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryFeatureAggregator.java +++ /dev/null @@ -1,24 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/4/11 - * Time: 12:52 PM - * To change this template use File | Settings | File Templates. 
- */ -public abstract class BinaryFeatureAggregator extends ReadFeatureAggregator { - - public BinaryFeatureAggregator(RFAArgumentCollection col) { - super(col); - } - - public void aggregate(Boolean hasFeature) { - // now robustified - mean = ( (hasFeature ? 1 : 0)+nReads*mean)/(++nReads); - var = mean*(1-mean) + Math.pow(2,1-nReads); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ClippedBases.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ClippedBases.java deleted file mode 100755 index dc3f23793..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ClippedBases.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/4/11 - * Time: 1:33 PM - * To change this template use File | Settings | File Templates. 
- */ -public class ClippedBases { - // todo -- make a binary feature version of this - - public Integer extractFeature(SAMRecord record) { - int nClipped = 0; - - for ( CigarElement e : record.getCigar().getCigarElements() ) { - if ( e.getOperator().equals(CigarOperator.SOFT_CLIP) || e.getOperator().equals(CigarOperator.HARD_CLIP) ) { - nClipped += e.getLength(); - } - } - - return nClipped; - } - - public boolean featureDefined(SAMRecord rec) { return true; } - - public ClippedBases(RFAArgumentCollection col) { - //super(col); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/InsertSize.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/InsertSize.java deleted file mode 100755 index c8d8fcce6..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/InsertSize.java +++ /dev/null @@ -1,27 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/4/11 - * Time: 12:58 PM - * To change this template use File | Settings | File Templates. - */ -public class InsertSize { - // todo -- this is deprecated by AIS, so the extension is removed. 
- - public InsertSize(RFAArgumentCollection col) { - //super(col); - } - - protected Integer extractFeature(SAMRecord record) { - return Math.abs(record.getInferredInsertSize()); - } - - protected boolean featureDefined(SAMRecord record) { - return record.getReadPairedFlag() && record.getProperPairFlag(); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateOtherContig.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateOtherContig.java deleted file mode 100755 index 7e6c2e1c4..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateOtherContig.java +++ /dev/null @@ -1,26 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/4/11 - * Time: 1:06 PM - * To change this template use File | Settings | File Templates. - */ -public class MateOtherContig extends BinaryFeatureAggregator { - - public MateOtherContig(RFAArgumentCollection col) { - super(col); - } - - public Boolean extractFeature(SAMRecord record) { - return ! record.getReferenceName().equals(record.getMateReferenceName()); - } - - public boolean featureDefined(SAMRecord read) { - return read.getReadPairedFlag() && ! 
read.getMateUnmappedFlag(); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateSameStrand.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateSameStrand.java deleted file mode 100755 index 9b9a6a1fd..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateSameStrand.java +++ /dev/null @@ -1,27 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/4/11 - * Time: 1:15 PM - * To change this template use File | Settings | File Templates. - */ -public class MateSameStrand extends BinaryFeatureAggregator { - - public Boolean extractFeature(SAMRecord record) { - return record.getReadNegativeStrandFlag() == record.getMateNegativeStrandFlag(); - } - - public boolean featureDefined(SAMRecord record) { - return record.getReadPairedFlag() && ! record.getMateUnmappedFlag() && record.getReferenceIndex().equals(record.getMateReferenceIndex()); - } - - - public MateSameStrand(RFAArgumentCollection col) { - super(col); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateUnmapped.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateUnmapped.java deleted file mode 100755 index bd9fcb1f3..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateUnmapped.java +++ /dev/null @@ -1,26 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. 
- * User: chartl - * Date: 5/4/11 - * Time: 1:32 PM - * To change this template use File | Settings | File Templates. - */ -public class MateUnmapped extends BinaryFeatureAggregator { - - public Boolean extractFeature(SAMRecord record) { - return record.getMateUnmappedFlag(); - } - - public boolean featureDefined(SAMRecord record) { - return record.getReadPairedFlag(); - } - - public MateUnmapped(RFAArgumentCollection col) { - super(col); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/NumericFeatureAggregator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/NumericFeatureAggregator.java deleted file mode 100755 index dfd5fc089..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/NumericFeatureAggregator.java +++ /dev/null @@ -1,50 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/4/11 - * Time: 12:39 PM - * To change this template use File | Settings | File Templates. 
- */ -public abstract class NumericFeatureAggregator extends ReadFeatureAggregator { - - private int min; - private int max; - - public NumericFeatureAggregator(RFAArgumentCollection col) { - super(col); - min = -1; - max = -1; - } - - protected void aggregate(Integer datum) { - if ( min == -1 ) { - min = datum; - } else if ( max == -1 ) { - if ( datum > min ) { - max = datum; - } else { - max = min; - min = datum; - } - } else if ( datum > max ) { - update(max); - max = datum; - } else if ( datum < min ) { - update(min); - min = datum; - } else { - update(datum); - } - } - - protected void update(Integer datum) { - double oldMean = mean; - mean += (datum - mean)/(1+nReads); - var = ((nReads*var) + (datum - oldMean)*(datum-mean))/++nReads; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ProperPair.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ProperPair.java deleted file mode 100755 index ef1a1d436..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ProperPair.java +++ /dev/null @@ -1,26 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 6/8/11 - * Time: 11:59 AM - * To change this template use File | Settings | File Templates. 
- */ -public class ProperPair extends BinaryFeatureAggregator { - - public ProperPair(RFAArgumentCollection collection) { - super(collection); - } - - public Boolean extractFeature(SAMRecord record) { - return record.getProperPairFlag(); - } - - public boolean featureDefined(SAMRecord record) { - return record.getReadPairedFlag(); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ReadFeatureAggregator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ReadFeatureAggregator.java deleted file mode 100755 index 4bf167657..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ReadFeatureAggregator.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 5/4/11 - * Time: 12:33 PM - * To change this template use File | Settings | File Templates. 
- */ -public abstract class ReadFeatureAggregator { - - protected double mean; - protected double var; - protected int nReads; - - public ReadFeatureAggregator(RFAArgumentCollection collection) { - init(collection); - mean = 0.0; - var = 0.0; - nReads = 0; - } - - public void aggregate(SAMRecord record) { - if ( featureDefined(record) ) { - aggregate(extractFeature(record)); - } - } - - protected abstract void aggregate(X feature); - - protected abstract boolean featureDefined(SAMRecord record); - - protected abstract X extractFeature(SAMRecord record); - - public double getMean() { return mean; } - public double getVar() { return var; } - public double getUnbiasedVar() { return var*( (double) nReads)/(nReads-1); } - public int getnReads() { return nReads; } - - public void init(RFAArgumentCollection collection) { } - - public X parse(SAMRecord read) { - if ( featureDefined(read) ) { - return extractFeature(read); - } else { - return null; - } - } - - public String parseStr(SAMRecord read) { - if ( featureDefined(read) ) { - return extractFeature(read).toString(); - } else { - return "undefined"; - } - } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CalcFullHaplotypesWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CalcFullHaplotypesWalker.java deleted file mode 100755 index c0dce0d46..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CalcFullHaplotypesWalker.java +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, 
subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers.phasing; - -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.PrintStream; -import java.util.*; - -import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - -/** - * Walks along all variant ROD loci and verifies the phasing from the reads for user-defined pairs of sites. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}) - -public class CalcFullHaplotypesWalker extends RodWalker { - private Map waitingHaplotypes = null; - - @Output(doc = "File to which results should be written", required = true) - protected PrintStream out; - - @Argument(doc = "sample to emit", required = false) - protected String sample = null; - - @Argument(doc = "only include physically-phased results", required = false) - protected boolean requirePQ = false; - - public void initialize() { - this.waitingHaplotypes = new HashMap(); - - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), null); - for (VCFHeader header : rodNameToHeader.values()) { - for (String sample : header.getGenotypeSamples()) - waitingHaplotypes.put(sample, null); - } - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. 
- */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - GenomeLoc curLocus = ref.getLocus(); - outputDoneHaplotypes(curLocus); - - int curPosition = curLocus.getStop(); - int prevPosition = curPosition - 1; - - // Extend the haplotypes to include up to this position (BUT EXCLUSIVE OF THIS POSITION): - for (Map.Entry sampleHapEntry : waitingHaplotypes.entrySet()) { - Haplotype waitingHaplotype = sampleHapEntry.getValue(); - - if (waitingHaplotype == null) {// changed to a new contig: - // Set the new haplotype to extend from [1, prevPosition] - if (prevPosition >= 1) { - GenomeLoc startInterval = getToolkit().getGenomeLocParser().createGenomeLoc(curLocus.getContig(), 1, prevPosition); - waitingHaplotype = new Haplotype(startInterval, sampleHapEntry.getKey()); - sampleHapEntry.setValue(waitingHaplotype); - } - } - else - waitingHaplotype.extend(prevPosition); - } - - Collection vcs = tracker.getAllVariantContexts(ref, context.getLocation()); - for (VariantContext vc : vcs) { - if (vc.isFiltered()) - continue; - - if (sample != null) - vc = vc.subContextFromGenotypes(vc.getGenotype(sample)); - - for (Map.Entry sampleGtEntry : vc.getGenotypes().entrySet()) { - String sample = sampleGtEntry.getKey(); - Genotype gt = sampleGtEntry.getValue(); - - if (gt.isHet()) { - Haplotype sampleHap = waitingHaplotypes.get(sample); - if (sampleHap == null) - throw new ReviewedStingException("EVERY sample should have a haplotype [by code above and getToolkit().getSamples()]"); - - // Terminate the haplotype before here: - if (!gt.isPhased() || (requirePQ && !gt.hasAttribute(ReadBackedPhasingWalker.PQ_KEY))) { - outputHaplotype(sampleHap); - - // Start a new haplotype from the current position: - sampleHap = new Haplotype(curLocus, sample); - waitingHaplotypes.put(sample, sampleHap); - } - else { - sampleHap.extend(curPosition); - } - - sampleHap.incrementHetCount(); - } - } - } - - return 1; - } - 
- public Integer reduce(Integer addIn, Integer runningCount) { - if (addIn == null) - addIn = 0; - - return runningCount + addIn; - } - - private void outputDoneHaplotypes(GenomeLoc curLocus) { - for (Map.Entry sampleHapEntry : waitingHaplotypes.entrySet()) { - Haplotype waitingHaplotype = sampleHapEntry.getValue(); - - if (waitingHaplotype != null) { - if (curLocus == null || !waitingHaplotype.interval.onSameContig(curLocus)) { - sampleHapEntry.setValue(null); - - // Set the output haplotype to terminate at the end of its contig: - int contigLength = getContigLength(waitingHaplotype.interval.getContig()); - waitingHaplotype.extend(contigLength); - outputHaplotype(waitingHaplotype); - } - } - } - } - - private int getContigLength(String contig) { - return getToolkit().getGenomeLocParser().getContigInfo(contig).getSequenceLength(); - } - - private void outputHaplotype(Haplotype h) { - out.println(h); - } - - /** - * @param result the number of reads and VariantContexts seen. - */ - public void onTraversalDone(Integer result) { - outputDoneHaplotypes(null); - - System.out.println("map was called " + result + " times."); - } - - private class Haplotype { - public GenomeLoc interval; - public String sample; - public int hetCount; - - public Haplotype(GenomeLoc interval, String sample) { - this.interval = interval; - this.sample = sample; - this.hetCount = 0; - } - - public void extend(int stop) { - if (stop > interval.getStop()) - interval = getToolkit().getGenomeLocParser().createGenomeLoc(interval.getContig(), interval.getStart(), stop); - } - - public void incrementHetCount() { - hetCount++; - } - - public String toString() { - return sample + "\t" + interval.toString() + "\t" + interval.size() + "\t" + hetCount; - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java 
b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java deleted file mode 100755 index a385f870d..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java +++ /dev/null @@ -1,532 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers.phasing; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.phasing.AllelePair; -import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; -import org.broadinstitute.sting.gatk.walkers.phasing.WriteVCF; -import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.GenotypePhasingEvaluator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.io.PrintStream; -import java.util.*; - -import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - -/** - * Walks along all variant ROD loci and verifies the phasing from the reads for user-defined pairs of sites. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = ComparePhasingToTrioPhasingNoRecombinationWalker.TRIO_ROD_NAME, type = ReferenceOrderedDatum.class), @RMD(name = ComparePhasingToTrioPhasingNoRecombinationWalker.PHASING_ROD_NAME, type = ReferenceOrderedDatum.class)}) - -@ReadFilters({ZeroMappingQualityReadFilter.class}) -// Filter out all reads with zero mapping quality - -public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker { - public final static String TRIO_ROD_NAME = "trio"; - public final static String PHASING_ROD_NAME = "phasing"; - - private final static int NUM_IN_TRIO = 3; - - private final static int DIPLOID = 2; - - @Output - protected PrintStream out; - - @Argument(fullName = "trioAugmentedPhasing", shortName = "trioAugmentedPhasing", doc = "File to which trio-phased variants should be written", required = false) - protected VCFWriter writer = null; - - @Argument(fullName = "diffTrioAndPhasingTracks", shortName = "diffTrioAndPhasingTracks", doc = "File to which comparisons of phasing information in 'trio' and 'phasing' tracks should be written", required = false) - protected PrintStream diffTrioAndPhasingTracks = null; - - private CompareTrioAndPhasingTracks diffTrioAndPhasingCounts = null; - - private String phasingSample = null; - - private enum TrioStatus { - PRESENT, MISSING, TRIPLE_HET - } - - private GenomeLoc prevLoc = null; - private VariantContext prevTrioVc = null; - private TrioStatus prevTrioStatus = TrioStatus.MISSING; - - private Genotype prevPhasingGt = null; - - - public void initialize() { - initializeVcfWriter(); - - // Will compare the phasing ALREADY present in the trio track [without regards to what this trio phasing mechanism (without recombination) would do]: - if (diffTrioAndPhasingTracks != null) - diffTrioAndPhasingCounts = new CompareTrioAndPhasingTracks(); - } - - private void initializeVcfWriter() { - if (writer == null) - 
return; - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - List rodNames = new LinkedList(); - rodNames.add(PHASING_ROD_NAME); - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); - Set samples = new TreeSet(rodNameToHeader.get(PHASING_ROD_NAME).getGenotypeSamples()); - writer.writeHeader(new VCFHeader(hInfo, samples)); - } - - public boolean generateExtendedEvents() { - return false; - } - - public CompareToTrioPhasingStats reduceInit() { - return new CompareToTrioPhasingStats(); - } - - /** - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. - */ - public CompareResult map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - GenomeLoc curLoc = ref.getLocus(); - VariantContext phasingVc = tracker.getVariantContext(ref, PHASING_ROD_NAME, curLoc); - - CompareToTrioPhasingStats stats = new CompareToTrioPhasingStats(); - CompareResult result = new CompareResult(phasingVc, stats); - - if (phasingVc == null || phasingVc.isFiltered()) - return result; - - Map phasingSampleToGt = phasingVc.getGenotypes(); - if (phasingSampleToGt.size() != 1) - throw new UserException("Must provide EXACTLY one sample in " + PHASING_ROD_NAME + " track!"); - Map.Entry phasingSampGt = phasingSampleToGt.entrySet().iterator().next(); - String sample = phasingSampGt.getKey(); - if (phasingSample == null) - phasingSample = sample; - if (!sample.equals(phasingSample)) - throw new UserException("Must provide EXACTLY one sample!"); - Genotype curPhasingGt = phasingSampGt.getValue(); - if (!curPhasingGt.isHet()) - return result; - - VariantContext 
curTrioVc = tracker.getVariantContext(ref, TRIO_ROD_NAME, curLoc); - boolean useTrioVc = (curTrioVc != null && !curTrioVc.isFiltered()); - - Genotype sampleCurGtInTrio = null; - if (useTrioVc) { - sampleCurGtInTrio = curTrioVc.getGenotype(phasingSample); - - if (curTrioVc.getNSamples() > NUM_IN_TRIO || sampleCurGtInTrio == null) - throw new UserException("Must provide trio data for sample: " + phasingSample); - - if (!curPhasingGt.sameGenotype(sampleCurGtInTrio)) { - logger.warn("Locus " + curLoc + " breaks phase, since " + PHASING_ROD_NAME + " and " + TRIO_ROD_NAME + " tracks have different genotypes for " + phasingSample + "!"); - prevLoc = null; - return result; - } - } - - // Now, we have a [trio-consistent] het genotype that may be phased or not [and we want to know if it could be phased based on trio information]: - int processed = 1; - - TrioStatus currentTrioStatus = TrioStatus.MISSING; - if (useTrioVc && curTrioVc.getNSamples() == NUM_IN_TRIO) { - boolean allHet = true; - for (int i = 0; i < NUM_IN_TRIO; i++) { - if (!curTrioVc.getGenotype(i).isHet()) { - allHet = false; - break; - } - } - - if (allHet) - currentTrioStatus = TrioStatus.TRIPLE_HET; - else - currentTrioStatus = TrioStatus.PRESENT; - } - - if (prevLoc != null && curLoc.onSameContig(prevLoc)) { - String trioPhaseStatus; - stats.comparedSites++; - String addToOutput = ""; - - if (prevTrioStatus == TrioStatus.TRIPLE_HET || currentTrioStatus == TrioStatus.TRIPLE_HET) { - trioPhaseStatus = "Het3"; - } - else if (prevTrioStatus == TrioStatus.MISSING || currentTrioStatus == TrioStatus.MISSING) { - trioPhaseStatus = "Missing"; - } - else { - if (prevTrioStatus != TrioStatus.PRESENT || currentTrioStatus != TrioStatus.PRESENT) - throw new ReviewedStingException("LOGICAL error: prevTrioStatus != TrioStatus.PRESENT || currentTrioStatus != TrioStatus.PRESENT"); - - trioPhaseStatus = "trio_phased"; - stats.trioPhaseableSites++; - - if (writer != null) { // Phase the genotypes using the trio information: - 
String parent1 = null; - String parent2 = null; - for (Map.Entry trioEntry : curTrioVc.getGenotypes().entrySet()) { - String trioSample = trioEntry.getKey(); - if (trioEntry.getValue().getPloidy() != DIPLOID) - throw new UserException("Each sample in trio must be diploid!"); - if (trioSample.equals(phasingSample)) - continue; - - if (parent1 == null) - parent1 = trioSample; - else if (parent2 == null) - parent2 = trioSample; - else - throw new ReviewedStingException("Cannot be more than 2 parents in TRIO!"); - } - if (parent1 == null || parent2 == null) - throw new ReviewedStingException("Must have 2 parents in TRIO!"); - - Genotype samplePrevGtInTrio = prevTrioVc.getGenotype(phasingSample); - - Genotype parent1PrevGt = prevTrioVc.getGenotype(parent1); - Genotype parent1CurGt = curTrioVc.getGenotype(parent1); - - Genotype parent2PrevGt = prevTrioVc.getGenotype(parent2); - Genotype parent2CurGt = curTrioVc.getGenotype(parent2); - - int prevHomIndex, prevOtherIndex; - Allele prevHomAllele; - Set prevOtherAlleles; - if (parent1PrevGt.isHom()) { - prevHomIndex = 1; - prevOtherIndex = 2; - prevHomAllele = parent1PrevGt.getAllele(0); - prevOtherAlleles = new TreeSet(parent2PrevGt.getAlleles()); - } - else if (parent2PrevGt.isHom()) { - prevHomIndex = 2; - prevOtherIndex = 1; - prevHomAllele = parent2PrevGt.getAllele(0); - prevOtherAlleles = new TreeSet(parent1PrevGt.getAlleles()); - } - else - throw new ReviewedStingException("LOGICAL ERROR: at least one parent is hom!"); - - int curHomIndex, curOtherIndex; - Allele curHomAllele; - Set curOtherAlleles; - if (parent1CurGt.isHom()) { - curHomIndex = 1; - curOtherIndex = 2; - curHomAllele = parent1CurGt.getAllele(0); - curOtherAlleles = new TreeSet(parent2CurGt.getAlleles()); - } - else if (parent2CurGt.isHom()) { - curHomIndex = 2; - curOtherIndex = 1; - curHomAllele = parent2CurGt.getAllele(0); - curOtherAlleles = new TreeSet(parent1CurGt.getAlleles()); - } - else - throw new ReviewedStingException("LOGICAL ERROR: at 
least one parent is hom!"); - - boolean phased = true; - - Map prevAlleleToParent = new TreeMap(); - for (Allele prevAllele : samplePrevGtInTrio.getAlleles()) { - if (prevAllele.equals(prevHomAllele)) - prevAlleleToParent.put(prevAllele, prevHomIndex); - else if (prevOtherAlleles.contains(prevAllele)) - prevAlleleToParent.put(prevAllele, prevOtherIndex); - else { - logger.warn("CANNOT phase, due to inconsistent inheritance of alleles!"); - phased = false; - break; - } - } - - Map parentToCurAllele = new HashMap(); - for (Allele curAllele : sampleCurGtInTrio.getAlleles()) { - if (curAllele.equals(curHomAllele)) - parentToCurAllele.put(curHomIndex, curAllele); - else if (curOtherAlleles.contains(curAllele)) - parentToCurAllele.put(curOtherIndex, curAllele); - else { - logger.warn("CANNOT phase, due to inconsistent inheritance of alleles!"); - phased = false; - break; - } - } - - if (phased) { - List phasedCurAlleles = new LinkedList(); - for (Allele prevAllele : prevPhasingGt.getAlleles()) { - Integer prevIndex = prevAlleleToParent.get(prevAllele); - if (prevIndex == null) - throw new ReviewedStingException("LOGICAL error: expecting to find prev allele in trio parents"); - Allele curAllele = parentToCurAllele.get(prevIndex); - if (curAllele == null) - throw new ReviewedStingException("LOGICAL error: expecting to find cur allele in trio parents"); - phasedCurAlleles.add(curAllele); - } - - boolean useTrioPhase = true; - Genotype phasedGt = new Genotype(phasingSample, phasedCurAlleles, curPhasingGt.getNegLog10PError(), curPhasingGt.getFilters(), curPhasingGt.getAttributes(), phased); - - if (curPhasingGt.isPhased()) { - stats.bothCanPhase++; - useTrioPhase = false; - - boolean ignorePhase = false; - if (!phasedGt.sameGenotype(curPhasingGt, ignorePhase)) { - String contradictMessage = "Phase from " + PHASING_ROD_NAME + " track at " + curLoc + " contradicts the trio-based phasing."; - stats.contradictoryPhaseSites++; - addToOutput += "\tcontradictory"; - - if 
(phasingVc.hasAttribute(ReadBackedPhasingWalker.PHASING_INCONSISTENT_KEY)) { - stats.contradictoryPhaseSitesWithPhaseInconsistency++; - addToOutput += "\tphaseInconsistent"; - useTrioPhase = true; - contradictMessage += " Ignoring " + PHASING_ROD_NAME + " phase due to phase-inconsistency."; - } - else { - contradictMessage += " Maintaining phase from " + PHASING_ROD_NAME + "."; - } - logger.warn(contradictMessage); - } - } - - if (useTrioPhase) { // trio phasing adds PREVIOUSLY UNKNOWN phase information: - Map genotypes = new HashMap(); - genotypes.put(phasingSample, phasedGt); - - phasingVc = VariantContext.modifyGenotypes(phasingVc, genotypes); - result.phasedVc = phasingVc; - } - } - } - } - out.println(prevLoc + "\t" + curLoc + "\t" + trioPhaseStatus + "\t" + curPhasingGt.isPhased() + addToOutput); - - if (diffTrioAndPhasingTracks != null && prevTrioStatus != TrioStatus.MISSING && currentTrioStatus != TrioStatus.MISSING && sampleCurGtInTrio.isPhased() && curPhasingGt.isPhased()) { - AllelePair prevTrioAll = new AllelePair(prevTrioVc.getGenotype(phasingSample)); - AllelePair curTrioAll = new AllelePair(sampleCurGtInTrio); - - AllelePair prevPhasingAll = new AllelePair(prevPhasingGt); - AllelePair curPhasingAll = new AllelePair(curPhasingGt); - - boolean topsMatch = (GenotypePhasingEvaluator.topMatchesTop(prevTrioAll, prevPhasingAll) && GenotypePhasingEvaluator.topMatchesTop(curTrioAll, curPhasingAll)); - boolean bottomsMatch = (GenotypePhasingEvaluator.bottomMatchesBottom(prevTrioAll, prevPhasingAll) && GenotypePhasingEvaluator.bottomMatchesBottom(curTrioAll, curPhasingAll)); - - boolean topMatchesBottom = (GenotypePhasingEvaluator.topMatchesBottom(prevTrioAll, prevPhasingAll) && GenotypePhasingEvaluator.topMatchesBottom(curTrioAll, curPhasingAll)); - boolean bottomMatchesTop = (GenotypePhasingEvaluator.bottomMatchesTop(prevTrioAll, prevPhasingAll) && GenotypePhasingEvaluator.bottomMatchesTop(curTrioAll, curPhasingAll)); - - boolean phasesAgree = ((topsMatch && 
bottomsMatch) || (topMatchesBottom && bottomMatchesTop)); - - diffTrioAndPhasingTracks.println(prevLoc + "\t" + curLoc + "\t" + trioPhaseStatus + "\t" + phasesAgree); - diffTrioAndPhasingCounts.addComparison(trioPhaseStatus, phasesAgree); - } - } - - prevLoc = curLoc; - prevTrioVc = curTrioVc; - prevTrioStatus = currentTrioStatus; - prevPhasingGt = curPhasingGt; - - return result; - } - - public CompareToTrioPhasingStats reduce(CompareResult addIn, CompareToTrioPhasingStats runningCount) { - if (addIn == null) - addIn = new CompareResult(); - - if (writer != null && addIn.phasedVc != null) - WriteVCF.writeVCF(addIn.phasedVc, writer, logger); - - return runningCount.addIn(addIn.stats); - } - - /** - * @param result the number of reads and VariantContexts seen. - */ - public void onTraversalDone(CompareToTrioPhasingStats result) { - System.out.println("Compared " + result.comparedSites + " sites."); - System.out.println("Trio can phase " + result.trioPhaseableSites + " sites."); - System.out.println("Trio and " + PHASING_ROD_NAME + " track can both phase " + result.bothCanPhase + " sites."); - System.out.println("Contradiction between phase inferred from " + TRIO_ROD_NAME + " and phase present in " + PHASING_ROD_NAME + " tracks at " + result.contradictoryPhaseSites + " sites."); - System.out.println("Of those, " + PHASING_ROD_NAME + " track is phase-inconsistent at " + result.contradictoryPhaseSitesWithPhaseInconsistency + " sites."); - - if (diffTrioAndPhasingCounts != null) { - System.out.println(""); - diffTrioAndPhasingCounts.printSummary(System.out); - } - } -} - -class CompareToTrioPhasingStats { - public int comparedSites; - public int trioPhaseableSites; - public int contradictoryPhaseSites; - public int contradictoryPhaseSitesWithPhaseInconsistency; - public int bothCanPhase; - - public CompareToTrioPhasingStats() { - this.comparedSites = 0; - this.trioPhaseableSites = 0; - this.contradictoryPhaseSites = 0; - 
this.contradictoryPhaseSitesWithPhaseInconsistency = 0; - this.bothCanPhase = 0; - } - - public CompareToTrioPhasingStats addIn(CompareToTrioPhasingStats other) { - this.comparedSites += other.comparedSites; - this.trioPhaseableSites += other.trioPhaseableSites; - this.contradictoryPhaseSites += other.contradictoryPhaseSites; - this.contradictoryPhaseSitesWithPhaseInconsistency += other.contradictoryPhaseSitesWithPhaseInconsistency; - this.bothCanPhase += other.bothCanPhase; - - return this; - } -} - -class CompareResult { - public VariantContext phasedVc; - public CompareToTrioPhasingStats stats; - - public CompareResult() { - this.phasedVc = null; - this.stats = new CompareToTrioPhasingStats(); - } - - public CompareResult(VariantContext phasedVc, CompareToTrioPhasingStats stats) { - this.phasedVc = phasedVc; - this.stats = stats; - } -} - -class CompareTrioAndPhasingTracks { - private Map trioStatusToAgreement; - - public CompareTrioAndPhasingTracks() { - this.trioStatusToAgreement = new HashMap(); - } - - public void addComparison(String trioStatus, boolean agree) { - AgreeDisagreeCounts counts = trioStatusToAgreement.get(trioStatus); - if (counts == null) { - counts = new AgreeDisagreeCounts(); - trioStatusToAgreement.put(trioStatus, counts); - } - - if (agree) - counts.incrementAgree(); - else - counts.incrementDisagree(); - } - - public void printSummary(PrintStream out) { - out.println("--------------------------------------------"); - out.println("Summary of trio vs. 
phasing tracks' phasing:"); - out.println("--------------------------------------------"); - - int globalAgree = 0; - int globalDisagree = 0; - for (AgreeDisagreeCounts counts : trioStatusToAgreement.values()) { - globalAgree += counts.agree; - globalDisagree += counts.disagree; - } - int globalTotal = globalAgree + globalDisagree; - - out.println("Concordant phase:\t" + percentString(globalAgree, globalTotal)); - out.println("Discordant phase:\t" + percentString(globalDisagree, globalTotal)); - - for (Map.Entry statusCounts : trioStatusToAgreement.entrySet()) { - String status = statusCounts.getKey(); - AgreeDisagreeCounts counts = statusCounts.getValue(); - - out.println(""); - out.println("'" + status + "'" + " Concordant phase:\t" + percentString(counts.agree, counts.total())); - out.println("'" + status + "'" + " Discordant phase:\t" + percentString(counts.disagree, counts.total())); - } - out.println("--------------------------------------------"); - out.println(""); - } - - private static String percentString(int numerator, int denominator) { - int NUM_DECIMAL_PLACES = 1; - String percent = new Formatter().format("%." 
+ NUM_DECIMAL_PLACES + "f", MathUtils.percentage(numerator, denominator)).toString(); - - StringBuilder sb = new StringBuilder(); - sb.append(numerator).append(" (").append(percent).append("%)"); - - return sb.toString(); - } -} - -class AgreeDisagreeCounts { - protected int agree; - protected int disagree; - - public AgreeDisagreeCounts() { - this.agree = 0; - this.disagree = 0; - } - - public void incrementAgree() { - agree++; - } - - public void incrementDisagree() { - disagree++; - } - - public int total() { - return agree + disagree; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CountHetPhasingInIntervalWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CountHetPhasingInIntervalWalker.java deleted file mode 100755 index 5c0af0e41..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CountHetPhasingInIntervalWalker.java +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers.phasing; - -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.PrintStream; -import java.util.*; - -/** - * Walks along all variant ROD loci and verifies the phasing from the reads for user-defined pairs of sites. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = "variant", type = ReferenceOrderedDatum.class), @RMD(name = CountHetPhasingInIntervalWalker.INTERVALS_ROD_NAME, type = ReferenceOrderedDatum.class)}) - -@ReadFilters({ZeroMappingQualityReadFilter.class}) -// Filter out all reads with zero mapping quality - -public class CountHetPhasingInIntervalWalker extends RodWalker { - private LinkedList rodNames = null; - - private GenomeLoc prevInterval = null; - - private MultiSampleIntervalStats intervalStats = null; - - @Output - protected PrintStream out; - - @Argument(fullName = "perIntervalOut", shortName = "perIntervalOut", doc = "File to which to write per-sample, per-interval phased het statistics", required = false) - protected PrintStream perIntervalOut = null; - - public final static String INTERVALS_ROD_NAME = "intervals"; - - public void initialize() { - rodNames = new LinkedList(); - rodNames.add("variant"); - - intervalStats = new MultiSampleIntervalStats(perIntervalOut); - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. 
- */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - int processed = 1; - - List interval = tracker.getGATKFeatureMetaData(INTERVALS_ROD_NAME, true); - if (interval.size() != 1) { - String error = "At " + ref.getLocus() + " : Must provide a track named '"+ INTERVALS_ROD_NAME +"' with exactly ONE interval per locus in -L argument!"; - if (interval.size() < 1) - throw new UserException(error); - else // interval.size() > 1 - logger.warn(error); - } - // Take the FIRST interval covering this locus, and WARN about multiple intervals (above): - GenomeLoc curInterval = interval.get(0).getLocation(); - logger.debug("refLocus: " + ref.getLocus() + "\tcurInterval = " + curInterval); - - boolean isNewInterval = (prevInterval == null || !curInterval.equals(prevInterval)); - if (isNewInterval) - intervalStats.startNewInterval(curInterval); - - boolean requireStartHere = true; // only see each VariantContext once - boolean takeFirstOnly = false; // take as many entries as the VCF file has - for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { - Map sampToGenotypes = vc.getGenotypes(); - for (Map.Entry sampEntry : sampToGenotypes.entrySet()) { - Genotype gt = sampEntry.getValue(); - intervalStats.processHetSiteInInterval(sampEntry.getKey(), gt.isHet(), gt.isPhased()); - } - } - - prevInterval = curInterval; - - return processed; - } - - public Integer reduce(Integer addIn, Integer runningCount) { - if (addIn == null) - addIn = 0; - - return runningCount + addIn; - } - - /** - * @param result the number of reads and VariantContexts seen. 
- */ - public void onTraversalDone(Integer result) { - intervalStats.finalizeStats(); - - System.out.println("Processed " + result + " sites."); - - for (Map.Entry sampleEntry : intervalStats.entrySet()) { - out.println("Sample:\t" + sampleEntry.getKey()); - out.println(sampleEntry.getValue() + "\n"); - } - } - - class MultiSampleIntervalStats { - private Map sampleToStat; - protected int numIntervals; - - private PrintStream perIntervalOut; - private GenomeLoc curInterval; - - public MultiSampleIntervalStats(PrintStream perIntervalOut) { - this.sampleToStat = new HashMap(); - this.numIntervals = 0; - this.perIntervalOut = perIntervalOut; - } - - public void processHetSiteInInterval(String sample, boolean isHet, boolean isPhased) { - SingleSampleIntervalStats sampleStats = sampleToStat.get(sample); - if (sampleStats == null) { - sampleStats = new SingleSampleIntervalStats(); - sampleToStat.put(sample, sampleStats); - } - - sampleStats.updateHetStats(isHet, isPhased); - } - - public void finalizeStats() { - if (curInterval == null) - return; - - for (Map.Entry sampleStatEntry : sampleToStat.entrySet()) { - SingleSampleIntervalStats stats = sampleStatEntry.getValue(); - if (perIntervalOut != null && stats.numHetsInCurrentInterval > 0) { - String sample = sampleStatEntry.getKey(); - perIntervalOut.println(sample + "\t" + curInterval + "\t" + stats.numPhasedInCurrentInterval + "\t" + stats.numHetsInCurrentInterval + "\t" + stats.firstHetIsPhasedInCurrentInterval); - } - stats.finalizeStats(); // now, can reset the counters [after print-out] - } - } - - public void startNewInterval(GenomeLoc curInterval) { - finalizeStats(); - numIntervals++; - this.curInterval = curInterval; - } - - public Set> entrySet() { - return sampleToStat.entrySet(); - } - - private class SingleSampleIntervalStats { - public Map hetStatInIntervalToCount; - public int firstHetIsPhasedCount; - - private int numHetsInCurrentInterval; - private int numPhasedInCurrentInterval; - private boolean 
firstHetIsPhasedInCurrentInterval; - - public SingleSampleIntervalStats() { - this.hetStatInIntervalToCount = new TreeMap(); // implemented PhasedHetsStat.compareTo() - this.firstHetIsPhasedCount = 0; - - resetCurrentIntervalCounters(); - } - - private void resetCurrentIntervalCounters() { - this.numHetsInCurrentInterval = 0; - this.numPhasedInCurrentInterval = 0; - this.firstHetIsPhasedInCurrentInterval = false; - } - - public void updateHetStats(boolean isHet, boolean isPhased) { - if (isHet) { - numHetsInCurrentInterval++; - - if (isPhased) { - numPhasedInCurrentInterval++; - - if (numHetsInCurrentInterval == 1) - firstHetIsPhasedInCurrentInterval = true; - } - } - } - - public void finalizeStats() { - if (numIntervals == 0) // have not yet seen any intervals - return; - - PhasedHetsStat hetsAndPhased = new PhasedHetsStat(numHetsInCurrentInterval, numPhasedInCurrentInterval); - Integer cnt = hetStatInIntervalToCount.get(hetsAndPhased); - if (cnt == null) - cnt = 0; - hetStatInIntervalToCount.put(hetsAndPhased, cnt + 1); - - if (firstHetIsPhasedInCurrentInterval) - firstHetIsPhasedCount++; - - resetCurrentIntervalCounters(); - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append("# of intervals: " + numIntervals + "\n"); - sb.append("First het is phased: " + firstHetIsPhasedCount + "\n"); - - sb.append("Distribution of number of phased / hets per interval:" + "\n"); - for (Map.Entry hetStatEntry : hetStatInIntervalToCount.entrySet()) - sb.append(hetStatEntry.getKey() + "\t" + hetStatEntry.getValue() + "\n"); - - return sb.toString(); - } - } - } - - class PhasedHetsStat implements Comparable { - public int numHets; - public int numPhased; - - public PhasedHetsStat(int numHets, int numPhased) { - this.numHets = numHets; - this.numPhased = numPhased; - } - - public int compareTo(PhasedHetsStat that) { - if (this.numHets != that.numHets) - return this.numHets - that.numHets; - - return this.numPhased - that.numPhased; - } - - 
public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append(numPhased + " / " + numHets); - - return sb.toString(); - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java deleted file mode 100755 index ed5406b8b..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java +++ /dev/null @@ -1,367 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.phasing; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.util.*; - -/** - * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio, - * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype - * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents - * (their alleles in their genotypes are ordered as transmitted|untransmitted). 
Computes probability that the - * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to - * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally - * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or - * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to - * begin. - */ -public class PhaseByTransmission extends RodWalker { - @Argument(shortName="f", fullName="familyPattern", required=true, doc="Pattern for the family structure (usage: mom+dad=child)") - public String familyStr = null; - - @Argument(shortName="nofilters", fullName="disableFilters", required=false, doc="Disable filters for sites where the phase can't be determined, where the parental origin of the alleles is ambiguous (i.e. everyone is heterozygous), or Mendelian violations") - public Boolean noFilters = false; - - @Output - protected VCFWriter vcfWriter = null; - - private String SAMPLE_NAME_MOM; - private String SAMPLE_NAME_DAD; - private String SAMPLE_NAME_CHILD; - - private final String ROD_NAME = "variant"; - private final String AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME = "AmbiguousAlleleOrigin"; - private final String INSUFFICIENT_DATA_FILTER_NAME = "InsufficientInformation"; - private final String MENDELIAN_VIOLATION_FILTER_NAME = "MendelianViolation"; - private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; - private final String SOURCE_NAME = "PhaseByTransmission"; - - private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; - - /** - * Parse the familial relationship specification, and initialize VCF writer - */ - public void initialize() { - String[] pieces = familyStr.split("[\\+\\=]"); - - SAMPLE_NAME_MOM = pieces[0]; - SAMPLE_NAME_DAD = pieces[1]; - SAMPLE_NAME_CHILD = pieces[2]; - - ArrayList rodNames = new ArrayList(); - 
rodNames.add(ROD_NAME); - - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - - if (vcfSamples.size() != 3) { - throw new UserException("File to phase by transmission contains more than three samples. This walker only" + - "accepts VCFs with three samples, so that the meaning of the applied filters is" + - "unambiguous."); - } - - if (!vcfSamples.contains(SAMPLE_NAME_MOM) || !vcfSamples.contains(SAMPLE_NAME_DAD) || !vcfSamples.contains(SAMPLE_NAME_CHILD)) { - throw new UserException("One or more of the samples specified in the familyPattern argument is not present" + - "in this file. Please supply a VCF file that contains only three samples: the" + - "mother, the father, and the child"); - } - - Set samples = new HashSet(); - samples.add(SAMPLE_NAME_MOM); - samples.add(SAMPLE_NAME_DAD); - samples.add(SAMPLE_NAME_CHILD); - - Set headerLines = new HashSet(); - headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFilterHeaderLine(AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME, "The parental origin of each of the child's allele cannot be determined (ie everyone is heterozygous)")); - headerLines.add(new VCFFilterHeaderLine(INSUFFICIENT_DATA_FILTER_NAME, "The phase of the child's genotype cannot be determined (ie someone is a no-call)")); - headerLines.add(new VCFFilterHeaderLine(MENDELIAN_VIOLATION_FILTER_NAME, "No combination of the parents' alleles can yield the child's genotype (ie a possible Mendelian violation)")); - headerLines.add(new VCFInfoHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); - vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); - } - - private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) { - double[] momLikelihoods = 
MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector()); - double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector()); - double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector()); - - int momIndex = mom.getType().ordinal() - 1; - int dadIndex = dad.getType().ordinal() - 1; - int childIndex = child.getType().ordinal() - 1; - - return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex]; - } - - private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) { - List homRefAlleles = new ArrayList(); - homRefAlleles.add(refAllele); - homRefAlleles.add(refAllele); - Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - List hetAlleles = new ArrayList(); - hetAlleles.add(refAllele); - hetAlleles.add(altAllele); - Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - List homVarAlleles = new ArrayList(); - homVarAlleles.add(altAllele); - homVarAlleles.add(altAllele); - Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - ArrayList genotypes = new ArrayList(); - genotypes.add(homRef); - genotypes.add(het); - genotypes.add(homVar); - - return genotypes; - } - - private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) { - List alleles = g.getAlleles(); - int matchingAlleles = 0; - - for (Allele a : alleles) { - if (!alleleToMatch.equals(a)) { - matchingAlleles++; - } - } - - return matchingAlleles; - } - - private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) { - int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0; - int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 
1 : 0; - - int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0; - int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0; - - int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child); - int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child); - - return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles); - } - - private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) { - Set possiblePhasedChildGenotypes = new HashSet(); - - for (Allele momAllele : mom.getAlleles()) { - for (Allele dadAllele : dad.getAlleles()) { - ArrayList possiblePhasedChildAlleles = new ArrayList(); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(dadAllele); - - Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getNegLog10PError(), child.getFilters(), child.getAttributes(), true); - - possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype); - } - } - - ArrayList finalGenotypes = new ArrayList(); - - for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) { - if (child.sameGenotype(phasedChildGenotype, true)) { - Allele momTransmittedAllele = phasedChildGenotype.getAllele(0); - Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1); - - ArrayList phasedMomAlleles = new ArrayList(); - phasedMomAlleles.add(momTransmittedAllele); - phasedMomAlleles.add(momUntransmittedAllele); - - Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getNegLog10PError(), mom.getFilters(), mom.getAttributes(), true); - - Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1); - Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? 
dad.getAllele(0) : dad.getAllele(1); - - ArrayList phasedDadAlleles = new ArrayList(); - phasedDadAlleles.add(dadTransmittedAllele); - phasedDadAlleles.add(dadUntransmittedAllele); - - Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getNegLog10PError(), dad.getFilters(), dad.getAttributes(), true); - - finalGenotypes.add(phasedMomGenotype); - finalGenotypes.add(phasedDadGenotype); - finalGenotypes.add(phasedChildGenotype); - - return finalGenotypes; - } - } - - finalGenotypes.add(mom); - finalGenotypes.add(dad); - finalGenotypes.add(child); - - return finalGenotypes; - } - - private VariantContext phaseTrioGenotypes(VariantContext vc) { - Genotype mom = vc.getGenotype(SAMPLE_NAME_MOM); - Genotype dad = vc.getGenotype(SAMPLE_NAME_DAD); - Genotype child = vc.getGenotype(SAMPLE_NAME_CHILD); - - Set filters = new HashSet(); - filters.addAll(vc.getFilters()); - - Map attributes = new HashMap(); - attributes.putAll(vc.getAttributes()); - attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, 0.0); - - ArrayList finalGenotypes = new ArrayList(); - finalGenotypes.add(mom); - finalGenotypes.add(dad); - finalGenotypes.add(child); - - if (!mom.isCalled() || !dad.isCalled() || !child.isCalled()) { - filters.add(INSUFFICIENT_DATA_FILTER_NAME); - } else { - ArrayList possibleMomGenotypes = createAllThreeGenotypes(vc.getReference(), vc.getAlternateAllele(0), mom); - ArrayList possibleDadGenotypes = createAllThreeGenotypes(vc.getReference(), vc.getAlternateAllele(0), dad); - ArrayList possibleChildGenotypes = createAllThreeGenotypes(vc.getReference(), vc.getAlternateAllele(0), child); - - double bestConfigurationLikelihood = 0.0; - double bestPrior = 0.0; - Genotype bestMomGenotype = mom; - Genotype bestDadGenotype = dad; - Genotype bestChildGenotype = child; - - double norm = 0.0; - - /* - GATKReport report = new GATKReport(); - report.addTable("TransmissionProbability", "Reports various quantities used to compute transmission probability"); - - 
GATKReportTable table = report.getTable("TransmissionProbability"); - table.addPrimaryKey("config", false); - table.addColumn("mom", "unknown"); - table.addColumn("momProbability", "unknown"); - table.addColumn("dad", "unknown"); - table.addColumn("dadProbability", "unknown"); - table.addColumn("child", "unknown"); - table.addColumn("childProbability", "unknown"); - table.addColumn("configLikelihood", "unknown"); - */ - - for (Genotype momGenotype : possibleMomGenotypes) { - for (Genotype dadGenotype : possibleDadGenotypes) { - for (Genotype childGenotype : possibleChildGenotypes) { - double prior = isMendelianViolation(vc.getReference(), vc.getAlternateAllele(0), momGenotype, dadGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(momGenotype, dadGenotype, childGenotype); - norm += prior*configurationLikelihood; - - if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) { - bestConfigurationLikelihood = configurationLikelihood; - bestPrior = prior; - bestMomGenotype = momGenotype; - bestDadGenotype = dadGenotype; - bestChildGenotype = childGenotype; - } - - /* - String config = momGenotype.toString() + dadGenotype.toString() + childGenotype.toString(); - - double[] momLikelihoods = MathUtils.normalizeFromLog10(momGenotype.getLikelihoods().getAsVector()); - double momProbability = momLikelihoods[momGenotype.getType().ordinal() - 1]; - - table.set(config, "mom", momGenotype.toString()); - table.set(config, "momProbability", momProbability); - - double[] dadLikelihoods = MathUtils.normalizeFromLog10(dadGenotype.getLikelihoods().getAsVector()); - double dadProbability = dadLikelihoods[dadGenotype.getType().ordinal() - 1]; - - table.set(config, "dad", dadGenotype.toString()); - table.set(config, "dadProbability", dadProbability); - - double[] childLikelihoods = 
MathUtils.normalizeFromLog10(childGenotype.getLikelihoods().getAsVector()); - double childProbability = childLikelihoods[childGenotype.getType().ordinal() - 1]; - - table.set(config, "child", childGenotype.toString()); - table.set(config, "childProbability", childProbability); - - table.set(config, "configLikelihood", prior*configurationLikelihood); - */ - } - } - } - - /* - if (!mom.sameGenotype(bestMomGenotype) || !dad.sameGenotype(bestDadGenotype) || !child.sameGenotype(bestChildGenotype)) { - System.out.println("Found a better genotype configuration!"); - table.write(System.out); - System.out.println(mom.toBriefString() + " " + dad.toBriefString() + " " + child.toBriefString()); - System.out.println(bestMomGenotype.toBriefString() + " " + bestDadGenotype.toBriefString() + " " + bestChildGenotype.toBriefString()); - System.out.println(bestPrior*bestConfigurationLikelihood / norm); - System.out.println(""); - } - */ - - if (isMendelianViolation(vc.getReference(), vc.getAlternateAllele(0), bestMomGenotype, bestDadGenotype, bestChildGenotype)) { - filters.add(MENDELIAN_VIOLATION_FILTER_NAME); - } else if (bestMomGenotype.isHet() && bestDadGenotype.isHet() && bestChildGenotype.isHet()) { - filters.add(AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME); - } else { - finalGenotypes = getPhasedGenotypes(bestMomGenotype, bestDadGenotype, bestChildGenotype); - - attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm); - } - } - - return new VariantContext(SOURCE_NAME, vc.getChr(), vc.getStart(), vc.getStart(), vc.getAlleles(), finalGenotypes, vc.getNegLog10PError(), noFilters ? 
vc.getFilters() : filters, attributes); - } - - /** - * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype - * - * @param tracker the reference meta-data tracker - * @param ref the reference context - * @param context the alignment context - * @return null - */ - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker != null) { - Collection vcs = tracker.getVariantContexts(ref, ROD_NAME, null, context.getLocation(), true, true); - - for (VariantContext vc : vcs) { - vcfWriter.add(phaseTrioGenotypes(vc), ref.getBase()); - } - } - - return null; - } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - @Override - public Integer reduceInit() { - return null; - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. 
- */ - @Override - public Integer reduce(Integer value, Integer sum) { - return null; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhasingEval.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhasingEval.java deleted file mode 100644 index 4a1b51d4c..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhasingEval.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.oneoffprojects.walkers.phasing; - -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.io.PrintStream; -import java.util.*; - -/** - * Emits specific fields as dictated by the user from one or more VCF files. - */ -@Requires(value = {}) -public class PhasingEval extends RodWalker { - - @Output(doc = "File to which results should be written", required = true) - protected PrintStream out; - - @Argument(doc = "sample to emit", required = false) - protected String sample = null; - - @Argument(doc = "Analysis to perform", required = true) - protected Analysis analysis; - - public enum Analysis { - PHASING_BY_AC - } - - private class PhasingByAC { - int myAC = 0; - int myAN = 0; - int nHets = 0; - int nHetsPhased = 0; - - public PhasingByAC(int myAC, int myAN) { - this.myAC = myAC; - this.myAN = myAN; - } - } - - List phasingByACs = new ArrayList(); - - public void initialize() { - Set samples = SampleUtils.getSampleList(VCFUtils.getVCFHeadersFromRods(getToolkit(), null)); - int AN = 2 * samples.size(); - for (int i = 0; i <= AN; i++) { - phasingByACs.add(new PhasingByAC(i, AN)); - } - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if 
(tracker == null) // RodWalkers can make funky map calls - return 0; - - Collection vcs = tracker.getAllVariantContexts(ref, context.getLocation()); - for (VariantContext vc : vcs) { - if (sample != null) - vc = vc.subContextFromGenotypes(vc.getGenotype(sample)); - - if (analysis == Analysis.PHASING_BY_AC) { - int homref = vc.getHomRefCount(); - int homalt = vc.getHomVarCount(); - int het = vc.getHetCount(); - int ac = 2 * homalt + het; - - //int an = 2 * (homref + homalt + het); - - PhasingByAC data = phasingByACs.get(ac); - data.nHets += het > 0 ? 1 : 0; - data.nHetsPhased += isPhysicallyPhased(vc.getGenotypes().values()) ? 1 : 0; - } - } - - return 1; - } - - private boolean isPhysicallyPhased(Collection genotypes) { - for (Genotype g : genotypes) { - if (g.isHet() && g.hasAttribute(ReadBackedPhasingWalker.PQ_KEY)) - return true; - } - - return false; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) { - if (analysis == Analysis.PHASING_BY_AC) { - out.println(Utils.join("\t", Arrays.asList("ac", "an", "nhets", "nhetphased"))); - for (PhasingByAC pac : phasingByACs) { - out.printf("%d\t%d\t%d\t%d%n", pac.myAC, pac.myAN, pac.nHets, pac.nHetsPhased); - } - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/qc/AnalyzeMemoryConsumption.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/qc/AnalyzeMemoryConsumption.java deleted file mode 100644 index c90fa0699..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/qc/AnalyzeMemoryConsumption.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without 
limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.instrumentation.Sizeof; - -import java.io.PrintStream; -import java.lang.management.ManagementFactory; -import java.lang.management.MemoryMXBean; -import java.lang.management.MemoryUsage; - -/** - * Analyzes the memory consumption required by the input data set as a percentage of the total heap consumed. - * Uses the Sizeof operator, which means that some supplemental data must be added to the command line. - * - * add -javaagent:$STING_HOME/dist/StingUtils.jar as a command-line - * JVM argument. - * - * For up-to-the-minute documentation, see the org.broadinstitute.sting.utils.instrumentation.Sizeof class. 
- */ -public class AnalyzeMemoryConsumption extends LocusWalker { - @Output(doc="Write output to this file, or /dev/stdout if unspecified.") - private PrintStream out; - - @Argument(doc="How frequently should we emit heap usage data",required=false) - private int frequency = 1000; - - private MemoryMXBean monitor; - - public void initialize() { - monitor = ManagementFactory.getMemoryMXBean(); - out.println("contig\tlocus\tref.bytes\tref.pcnt.max\treads.count\treads.bytes\treads.pcnt.max\tRODs.bytes\tRODs.pcnt.max\tHeap.used.bytes\tHeap.used.pcnt.max\tMax.heap"); - } - - public LocusContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext reads) { - return new LocusContext(ref,reads,tracker); - } - - /** - * - */ - public Long reduceInit() { - return 0L; - } - - /** - * - */ - public Long reduce(LocusContext locusContext, Long sum) { - sum++; - - if(sum % frequency == 0) { - long refSizeInBytes = Sizeof.getObjectGraphSize(locusContext.reference); - long numReads = locusContext.alignedReads.size(); - long readsSizeInBytes = Sizeof.getObjectGraphSize(locusContext.alignedReads); - long trackerSizeInBytes = Sizeof.getObjectGraphSize(locusContext.referenceOrderedData); - - MemoryUsage memoryUsage = monitor.getHeapMemoryUsage(); - long memoryUsed = memoryUsage.getUsed(); - long maxMemory = memoryUsage.getMax(); - - out.printf("%s\t%s\t%d\t%.3f\t%d\t%d\t%.3f\t%d\t%.3f\t%d\t%.3f\t%d%n", - locusContext.reference.getLocus().getContig(), - locusContext.reference.getLocus().getStart(), - refSizeInBytes,refSizeInBytes*100.0/maxMemory, - numReads,readsSizeInBytes,readsSizeInBytes*100.0/maxMemory, - trackerSizeInBytes,trackerSizeInBytes*100.0/maxMemory, - memoryUsed,memoryUsed*100.0/maxMemory, - maxMemory); - } - - return sum; - } - -} - -/** - * Allows the user to easily pass data specific to a locus from map to reduce. 
- */ -class LocusContext { - public final ReferenceContext reference; - public final AlignmentContext alignedReads; - public final RefMetaDataTracker referenceOrderedData; - - public LocusContext(final ReferenceContext reference, final AlignmentContext alignedReads, final RefMetaDataTracker referenceOrderedData) { - this.reference = reference; - this.alignedReads = alignedReads; - this.referenceOrderedData = referenceOrderedData; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java deleted file mode 100755 index 045d442a9..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java +++ /dev/null @@ -1,229 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.validation; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; - -import java.io.PrintStream; -import java.util.LinkedList; -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 6/13/11 - * Time: 2:12 PM - * To change this template use File | Settings | File Templates. 
- */ -@Requires(value={DataSource.REFERENCE}, referenceMetaData={@RMD(name="ProbeIntervals",type=TableFeature.class), -@RMD(name="ValidateAlleles",type=VariantContext.class),@RMD(name="MaskAlleles",type=VariantContext.class)}) -public class PickSequenomProbes2 extends RodWalker { - - @Output - PrintStream out; - - GenomeLoc prevInterval; - GenomeLoc allelePos; - String probeName; - StringBuilder sequence; - boolean sequenceInvalid; - List invReason; - - public Integer reduceInit() { - prevInterval = null; - sequence = null; - sequenceInvalid = false; - probeName = null; - invReason = null; - return 0; - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null || ! tracker.hasROD("ProbeIntervals")) { return null; } - - GenomeLoc interval = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getLocation(); - if ( prevInterval == null || ! interval.equals(prevInterval) ) { - // we're in a new interval, we should: - // 1) print out previous data - // 2) reset internal data - // 3) instantiate traversal of this interval - - // step 1: - if ( prevInterval != null ) { - // there was a previous interval - validateSequence(); // ensure the sequence in the region is valid - lowerRepeats(); // change repeats in sequence to lower case - print(); // print out the fasta sequence - } - - // step 2: - prevInterval = interval; - allelePos = null; - sequence = new StringBuilder(); - sequenceInvalid = false; - invReason = new LinkedList(); - logger.debug(Utils.join("\t",((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues())); - probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getValue(1); - } - - // step 3 (or 1 if not new): - // build up the sequence - - VariantContext mask = tracker.getVariantContext(ref,"MaskAlleles",ref.getLocus()); - VariantContext validate = 
tracker.getVariantContext(ref,"ValidateAlleles",ref.getLocus()); - - if ( mask == null && validate == null ) { - sequence.append(Character.toUpperCase((char) ref.getBase())); - } else if ( validate != null ) { - // doesn't matter if there's a mask here too -- this is what we want to validate - sequence.append('['); - sequence.append(validate.getAlternateAllele(0).toString()); - sequence.append('/'); - sequence.append(validate.getReference().toString()); - sequence.append(']'); - allelePos = ref.getLocus(); - } else /* (mask != null && validate == null ) */ { - if ( ! mask.isSNP() && ! mask.isFiltered() && ! mask.isMonomorphic() ) { - logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed."); - logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles()))); - sequenceInvalid = true; - invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION"); - //sequence.append((char) ref.getBase()); - sequence.append(mask.isInsertion() ? 'I' : 'D'); - } else if ( ! mask.isFiltered() && ! mask.isMonomorphic() ){ - logger.debug("SNP in mask found at " + ref.getLocus().toString()); - sequence.append((char) BaseUtils.N); - } else if ( mask.isSNP() ) { - logger.debug("SNP in mask found at "+ref.getLocus().toString()+" but was either filtered or monomorphic"); - } - } - - return 1; - } - - public Integer reduce(Integer i, Integer j) { - return 0; - } - - public void validateSequence() { - // code for ensuring primer sequence is valid goes here - - // validate that there are no masked sites near to the variant site - String seq = sequence.toString(); - int start = seq.indexOf('[') - 4; - int end = seq.indexOf(']') + 5; - - if ( start < 50 ) { - logger.warn("There is not enough sequence before the start position of the probed allele for adequate probe design. 
This site will not be designed."); - sequenceInvalid = true; - invReason.add("START_TOO_CLOSE"); - } else if ( end > seq.length() - 50 ) { - logger.warn("There is not enough sequence after the end position of the probed allele fore adequate probe design. This site will not be desinged. "); - sequenceInvalid = true; - invReason.add("END_TOO_CLOSE"); - } else { - boolean maskNearVariantSite = false; - for ( int i = start; i < end; i++ ) { - maskNearVariantSite |= (seq.charAt(i) == 'N'); - } - - if ( maskNearVariantSite ) { - logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed."); - sequenceInvalid = true; - invReason.add("VARIANT_TOO_NEAR_PROBE"); - } - } - - if ( seq.indexOf("[") != seq.lastIndexOf("[") ) { - logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap."); - sequenceInvalid = true; - invReason.add("MULTIPLE_PROBES"); - } - - if ( seq.indexOf("[") < 0 ) { - logger.warn("No variants in region were found. This site will not be designed."); - sequenceInvalid = true; - invReason.add("NO_VARIANTS_FOUND"); - } - } - - public void lowerRepeats() { - // convert to lower case low-complexity repeats, e.g. 
tandem k-mers - final int K_LIM = 8; - String seq = sequence.toString(); - StringBuilder newSequence = new StringBuilder(); - int start_pos = 0; - while( start_pos < seq.length() ) { - boolean broke = false; - for ( int length = K_LIM; length > 1; length -- ) { - //logger.debug(String.format("start1: %d end1: %d start2: %d end2: %d str: %d",start_pos,start_pos+length,start_pos+length,start_pos+2*length,seq.length())); - if ( start_pos + 2*length> seq.length() ) { - continue; - } - if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) { - newSequence.append(seq.substring(start_pos,start_pos+length).toLowerCase()); - newSequence.append(seq.substring(start_pos+length,start_pos+2*length).toLowerCase()); - start_pos += 2*length; - broke = true; - break; - } - } - - if ( ! broke ) { - newSequence.append(seq.substring(start_pos,start_pos+1)); - start_pos++; - } - - } - - if ( seq.indexOf("[") != seq.lastIndexOf("[") ) { - return; - } - - sequence = newSequence; - } - - public boolean equalsIgnoreNs(String one, String two) { - if ( one.length() != two.length() ) { return false; } - for ( int idx = 0; idx < one.length(); idx++ ) { - if ( Character.toUpperCase(one.charAt(idx)) != Character.toUpperCase(two.charAt(idx)) ) { - if ( Character.toUpperCase(one.charAt(idx)) != 'N' && Character.toUpperCase(two.charAt(idx)) != 'N' ) { - return false; - } - } - } - - //logger.debug(String.format("one: %s two: %s",one,two)); - - return true; - } - - public void print() { - String valid; - if ( sequenceInvalid ) { - valid = ""; - while ( invReason.size() > 0 ) { - String reason = invReason.get(0); - invReason.remove(reason); - int num = 1; - while ( invReason.contains(reason) ) { - num++; - invReason.remove(reason); - } - valid += String.format("%s=%d,",reason,num); - } - } else { - valid = "Valid"; - } - - String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); - 
out.printf("%s\t%s\t%s\t%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/PrivatePermutations.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/PrivatePermutations.java deleted file mode 100755 index b6af87637..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/PrivatePermutations.java +++ /dev/null @@ -1,168 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.varianteval; - -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; -import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.tags.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; - -import java.util.Collection; -import java.util.Set; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Nov 22, 2010 - * Time: 12:22:08 PM - * To change this template use File | Settings | File Templates. 
- */ -@Analysis(name = "PrivatePermutations", description = "Number of additional mutations from each new sample; random permutations") -public class PrivatePermutations extends VariantEvaluator { - private final int NUM_PERMUTATIONS = 50; - private final double LOW_GQ_PCT = 0.80; - private final double LOW_GQ_THRSH = 25.0; - private final boolean IGNORE_FILTER_FIELD = true; - private boolean initialized = false; - private long skipped = 0l; - - @DataPoint(description="Number of additional mutations from each new sample; random permutations") - AdditionalBySample permuteCounts = null; - - String[][] permutations; - - public boolean enabled() { - return true; - } - - public int getComparisonOrder() { - return 2; - } - - public String getName() { - return "PrivatePermutations"; - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( eval != null && ! initialized ) { - //this.veWalker.getLogger().warn("Initializing..."); - initialize(eval); - initialized = true; - } - - if ( isGood(eval) ) { - if ( comp != null && ! comp.isFiltered() ) { - return null; - } - - int order_offset = 0; - for ( String[] ordering : permutations ) { - int sample_offset = 0; - for ( String sample : ordering ) { - if ( eval.getGenotype(sample).isHet() || eval.getGenotype(sample).isHomVar() ) { - break; - } - sample_offset ++; - } - - permuteCounts.additionalValue[order_offset][sample_offset]++; - order_offset++; - } - } else { - skipped++; - } - - return null; - } - - private boolean isGood(VariantContext vc) { - if ( vc == null || (vc.isFiltered() && ! 
IGNORE_FILTER_FIELD) || (vc.getHetCount() + vc.getHomVarCount() == 0) ) { // todo -- should be is variant, but need to ensure no alt alleles at ref sites - return false; - } else { - Collection gtypes = vc.getGenotypes().values(); - int ngood = 0; - int nbad = 0; - for ( Genotype g : gtypes) { - if ( g.getPhredScaledQual() >= LOW_GQ_THRSH ) { - ngood ++; - } else { - nbad ++; - } - } - - return ( (0.0+ngood)/(0.0+ngood + nbad) >= LOW_GQ_PCT ); - } - } - - public PrivatePermutations(VariantEvalWalker parent) { - //super(parent); - } - - public void initialize(VariantContext vc) { - Set permuteSamples = vc.getSampleNames(); - permutations = new String[NUM_PERMUTATIONS][permuteSamples.size()]; - //veWalker.getLogger().warn(String.format("Num samples: %d",permuteSamples.size())); - int offset = 0; - for ( String s : permuteSamples ) { - permutations[0][offset] = s; - offset ++; - } - - for ( int p = 1; p < NUM_PERMUTATIONS ; p++ ) { - permutations[p] = permutations[0].clone(); - for ( int o = 0; o < permutations[p].length; o ++ ) { - int r = (int) Math.floor(Math.random()*(o+1)); - String swap = permutations[p][r]; - permutations[p][r] = permutations[p][o]; - permutations[p][o] = swap; - } - } - - permuteCounts = new AdditionalBySample(); - permuteCounts.additionalValue = new int[NUM_PERMUTATIONS][permuteSamples.size()]; - } - - class AdditionalBySample implements TableType { - int[][] additionalValue; - //String[][] permutationNames; - String[] rowKeys = null; - String[] colKeys = null; - - public Object[] getRowKeys() { - if ( rowKeys == null ) { - rowKeys = new String[additionalValue.length]; - for ( int i = 0; i < additionalValue.length; i ++ ) { - rowKeys[i] = String.format("%s%d","P",i); - } - } - - - return rowKeys; - } - - public String getCell(int x, int y) { - return String.format("%d",additionalValue[x][y]); - } - - public String getName() { return "Marginal Number of Mutations"; } - - public Object[] getColumnKeys() { - if ( colKeys == null ) { - colKeys = 
new String[additionalValue[0].length]; - for ( int i = 0; i < additionalValue[0].length; i ++ ) { - colKeys[i] = String.format("%s%d","S",i); - } - } - - return colKeys; - } - } - - public void finalizeEvaluation() { - //veWalker.getLogger().info(String.format("Skipped: %d",skipped)); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java deleted file mode 100755 index b8937d5dd..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java +++ /dev/null @@ -1,177 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; - -import com.google.common.collect.ArrayListMultimap; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.util.*; - -/** - * @doc Merges N callsets that have been made on the same set of samples, and averages specific annotations. - * @Author chartl - * - */ -public class BootstrapCallsMerger extends RodWalker implements TreeReducible{ - - @Output - VCFWriter vcfWriter = null; - - // todo -- remove this, can be done just by looking at the type and iterating over ANNOTS_TO_AVG - // todo -- multi-allelic sites (e.g. 
what happens here:) - // todo -- Set 1: A G,T AC=2,4 ?? - // todo -- Set 1: A G AC=2 Set 2: A T AC=4 - // todo -- fix above behavior - final static private Set INTEGER_ANNOTS_CAN_MEAN = new HashSet(Arrays.asList("AC","AN")); - final static private Set ANNOTS_TO_AVG = new HashSet(Arrays.asList( - "QD","SB","HaplotypeScore","Dels","MQ","MQ0","sumGLByD","AC","AF","AN")); - - public void initialize() { - // grab the samples - Set samples = new HashSet(); - // setup the header fields - // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones - Set hInfo = new HashSet(); - for ( Map.Entry headers : VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), "bootstrap").entrySet() ) { - samples.addAll(headers.getValue().getGenotypeSamples()); - for ( VCFHeaderLine line : headers.getValue().getMetaData() ) { - logger.debug(line); - VCFHeaderLine altered = alterHeaderLine(line); - if ( VariantAnnotator.isUniqueHeaderLine(altered, hInfo) ) - hInfo.add(altered); - } - } - hInfo.add(new VCFInfoHeaderLine("NB",1,VCFHeaderLineType.Integer,"Number of bootsrap sets site was seen in")); - hInfo.add(new VCFFormatHeaderLine("BC",4,VCFHeaderLineType.Integer,"Genotype counts across bootsraps: ref,het,var,nocall")); - HashSet rodName = new HashSet(); - rodName.add("variant"); - VCFHeader vcfHeader = new VCFHeader(hInfo, samples); - vcfWriter.writeHeader(vcfHeader); - } - - /** - * Note: integer annotations will need to become floats, others will not - * (e.g. 
HRun won't change, but counts will) - * @param line - * @return line with type changed - */ - private VCFHeaderLine alterHeaderLine(VCFHeaderLine line) { - if ( line instanceof VCFInfoHeaderLine ) { - if(INTEGER_ANNOTS_CAN_MEAN.contains(((VCFInfoHeaderLine) line).getName())) { - return new VCFInfoHeaderLine(((VCFInfoHeaderLine) line).getName(), - ((VCFInfoHeaderLine) line).getCount(), - VCFHeaderLineType.Float, - ((VCFInfoHeaderLine) line).getDescription()); - } - } - return line; - } - - public VCFWriter reduceInit() { - return vcfWriter; - } - - public VCHolder map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) { - if ( tracker == null ) { return null; } - Collection bootstraps = tracker.getVariantContextsByPrefix(ref,Arrays.asList("bootstrap"),null,ref.getLocus(),true,false); - int num_bootstraps = bootstraps.size(); - if ( num_bootstraps == 0 ) { return null; } - Map avgInfo = new HashMap(ANNOTS_TO_AVG.size()); - Map genotypeCountsBySample = new HashMap(); - for ( VariantContext vc : bootstraps ) { - // update genotype counts - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { - if ( ! genotypeCountsBySample.containsKey(genotype.getKey())) { - genotypeCountsBySample.put(genotype.getKey(),new Integer[]{0,0,0,0}); - } - genotypeCountsBySample.get(genotype.getKey())[genotype2offset(genotype.getValue())]++; - } - // update info field annotations - for ( String anno : ANNOTS_TO_AVG ) { - if ( ! 
avgInfo.containsKey(anno) ) { - avgInfo.put(anno,0.0); - } - Object value = vc.getAttribute(anno); - if ( value instanceof Number ) { - //logger.debug(value); - avgInfo.put(anno,avgInfo.get(anno) + ((Number)value).doubleValue()/num_bootstraps); - } - if ( value instanceof String ) { - //logger.debug("string: "+value.toString()); - avgInfo.put(anno,avgInfo.get(anno) + Double.valueOf((String)value)/num_bootstraps); - } - } - } - VariantContext first = bootstraps.iterator().next(); - Map finalInfo = new HashMap(first.getAttributes().size()); - for ( Map.Entry attrib : first.getAttributes().entrySet() ) { - if ( ANNOTS_TO_AVG.contains(attrib.getKey()) ) { - finalInfo.put(attrib.getKey(),avgInfo.get(attrib.getKey())); - } else { - finalInfo.put(attrib.getKey(),attrib.getValue()); - } - } - Map finalGenotypes = new HashMap(first.getSampleNames().size()); - for ( Map.Entry g : first.getGenotypes().entrySet() ) { - Map att = new HashMap(g.getValue().getAttributes()); - att.put("BC",countsToString(genotypeCountsBySample.get(g.getKey()))); - //logger.debug(g.getValue()); - finalGenotypes.put(g.getKey(),Genotype.modifyAttributes(g.getValue(),att)); - //logger.debug("final:"); - //logger.debug(finalGenotypes.get(g.getKey())); - } - - finalInfo.put("NB",String.format("%d",num_bootstraps)); - - VariantContext attributeModified = VariantContext.modifyAttributes(first,finalInfo); - logger.debug(attributeModified.hasGenotypes() ? "attributes have genotypes" : "VERY BAD"); - VariantContext genotypeModified = VariantContext.modifyGenotypes(attributeModified,finalGenotypes); - logger.debug(genotypeModified.hasGenotypes() ? 
"modified genotypes have genotypes" : "NOT SO BAD"); - - return new VCHolder(genotypeModified,ref.getBase()); - - //return new VCHolder(VariantContext.modifyGenotypes(VariantContext.modifyAttributes(first, finalInfo), finalGenotypes), - // ref.getBase()); - } - - private static String countsToString(Integer[] c) { - return String.format("%d,%d,%d,%d",c[0],c[1],c[2],c[3]); - } - - public VCFWriter treeReduce(VCFWriter l, VCFWriter r) { - return l; - } - - public VCFWriter reduce(VCHolder h, VCFWriter w) { - if ( h != null ) { - w.add(h.v,h.b); - } - - return w; - } - - private static int genotype2offset(Genotype g) { - if ( g.isHomRef() ) { return 0; } - if ( g.isHet() ) { return 1; } - if ( g.isHomVar() ) { return 2; } - return 3; - } - - class VCHolder { - public VariantContext v; - public Byte b; - public VCHolder(VariantContext v, Byte b) { - this.v = v; - this.b = b; - } - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java deleted file mode 100755 index 933fc629d..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java +++ /dev/null @@ -1,197 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; 
-import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.SampleUtils; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Oct 16, 2010 - * Time: 8:20:41 PM - * To change this template use File | Settings | File Templates. - */ -@Requires(value={},referenceMetaData=@RMD(name="variant", type= VariantContext.class)) -public class FixRefBases extends RodWalker { - @Output(doc="output file to write to",required=true) - VCFWriter out; - @Hidden - @Argument(doc="",fullName="bypassException",required=false) - boolean bypass = false; - - public void initialize() { - if ( ! bypass ) { - throw new ReviewedStingException("This walker is currently broken. 
This exception is being thrown because you will not get out what you think you will."); - } - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); - headerLines.add(new VCFHeaderLine("source", "FixRefBases")); - headerLines.add(new VCFInfoHeaderLine("FRI",1,VCFHeaderLineType.String,"The Fix-Ref-Info (if present, either \"Flipped\" or \"Fixed\")")); - out.writeHeader(new VCFHeader(headerLines,vcfSamples)); - } - - public Integer reduceInit() { - return 0; - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker != null && tracker.hasROD("variant") ) { - VariantContext vc = null; - try { - vc = tracker.getVariantContext(ref,"variant",null,context.getLocation(),true); - } catch ( ReviewedStingException e ) { - logger.warn("Multiple variants found, catching exception ",e); - return 0; - } - VariantContext newContext = null; - if ( vc.isSNP() && ref.getBase() != vc.getReference().getBases()[0] && vc.getReference().length() == 1 ) { - if ( basesAreFlipped(vc,ref) ) { - logger.warn(String.format("Variant context at %s has ref and alt bases flipped according to reference",ref.getLocus().toString())); - newContext = flipBases(vc,ref); - } else { - HashSet newAlleles = new HashSet(vc.getAlternateAlleles()); - Allele refAllele = Allele.create(ref.getBase(),true); - newAlleles.add(refAllele); - HashMap newAttributes = new HashMap(vc.getAttributes()); - newAttributes.put("FRI",String.format("Fixed%s-%s",vc.getReference().toString(),refAllele)); - newContext = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(), - ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, fixGenotypes(vc.getGenotypes(),refAllele), - vc.hasNegLog10PError() ? 
10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, - vc.isFiltered() ? vc.getFilters() : null, newAttributes); - } - - if ( ! newContext.hasAttribute("FRI") ) { - throw new StingException("FRI for fixed base not propagated. vc="+vc.toString()); - } - out.add(newContext,ref.getBase()); - return 1; - - } else { - out.add(vc,ref.getBase()); - } - } - - return 0; - } - - public Integer reduce(Integer map, Integer reduce) { - return map + reduce; - } - - public void onTraversalDone(Integer fReduce) { - logger.info(String.format("Fixed %d records",fReduce)); - } - - private boolean basesAreFlipped(VariantContext vc, ReferenceContext reference) { - for ( Allele a : vc.getAlternateAlleles() ) { - if ( a.getBases().length == 1 && a.getBases()[0] == reference.getBase() ) { - return true; - } - } - return false; - } - - private VariantContext flipBases(VariantContext vc, ReferenceContext ref) { - logger.info(String.format("Flipping bases at variant position %s:%d",vc.getChr(),vc.getStart())); - HashSet newAlleles = new HashSet(vc.getAlleles().size()); - newAlleles.add(Allele.create(ref.getBase(),true)); - newAlleles.add(Allele.create(vc.getReference().getBases()[0],false)); - Map attribs = new HashMap(vc.getAttributes()); - attribs.put("FRI",String.format("Flipped%s-%s",vc.getReference().toString(),Allele.create(ref.getBase(),true).toString())); - for ( Allele a : vc.getAlternateAlleles() ) { - if ( a.getBases()[0] != ref.getBase() ) { - newAlleles.add(a); - } - } - - VariantContext newVC = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(), - ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, flipGenotypes(vc.getGenotypes(),newAlleles), - vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, - vc.isFiltered() ? 
vc.getFilters() : null, attribs); - - VariantContextUtils.calculateChromosomeCounts(newVC,attribs,false); - VariantContext.modifyAttributes(newVC,attribs); - return newVC; - } - - private Map fixGenotypes(Map old, Allele newRef) { - HashMap newGTs = new HashMap(old.size()); - for ( Map.Entry e : old.entrySet() ) { - newGTs.put(e.getKey(),fixGenotype(e.getValue(),newRef)); - } - - return newGTs; - } - - private Genotype fixGenotype(Genotype g, Allele newRef) { - List newAlleles = new ArrayList(g.getAlleles().size()); - for ( Allele a : g.getAlleles() ) { - if ( a.isReference() ) { - newAlleles.add(newRef); - } else { - newAlleles.add(a); - } - } - return new Genotype(g.getSampleName(), - newAlleles,g.hasNegLog10PError() ? g.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, - g.getAttributes().keySet(), g.getAttributes(),g.isPhased()); - } - - private Map flipGenotypes(Map old, Set newAlleles) { - HashMap newGTs = new HashMap(old.size()); - for ( Map.Entry e : old.entrySet() ) { - newGTs.put(e.getKey(),flipGenotype(e.getValue(),newAlleles)); - } - - return newGTs; - } - - private Genotype flipGenotype(Genotype old, Set newAlleles) { - Allele ref = null; - for ( Allele a : newAlleles ) { - if ( a.isReference() ) { - ref = a; - } - } - if ( ref == null ) { - throw new StingException("No reference allele in variant context with which to flip genotype alleles"); - } - - List newGTAlleles = new ArrayList(old.getAlleles().size()); - - for ( Allele a : old.getAlleles() ) { - if ( ! a.isNoCall() && a.getBases()[0] == ref.getBases()[0] ) { - newGTAlleles.add(ref); - } else { - if ( a.isReference() ) { - newGTAlleles.add(Allele.create(a.getBases(),false)); - } else { - newGTAlleles.add(a); - } - } - } - - return new Genotype(old.getSampleName(), - newGTAlleles,old.hasNegLog10PError() ? 
old.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, - old.getAttributes().keySet(), old.getAttributes(),old.isPhased()); - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/examples/SampleXmlMarshaller.java b/java/src/org/broadinstitute/sting/playground/examples/SampleXmlMarshaller.java deleted file mode 100755 index c5f4c7efb..000000000 --- a/java/src/org/broadinstitute/sting/playground/examples/SampleXmlMarshaller.java +++ /dev/null @@ -1,127 +0,0 @@ -package org.broadinstitute.sting.playground.examples; - -import org.apache.log4j.BasicConfigurator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.simpleframework.xml.Element; -import org.simpleframework.xml.Root; -import org.simpleframework.xml.Serializer; -import org.simpleframework.xml.stream.Format; -import org.simpleframework.xml.stream.HyphenStyle; -import org.simpleframework.xml.core.Persister; - -import java.io.File; - -/** - * Created by IntelliJ IDEA. - * User: hanna - * Date: Apr 7, 2009 - * Time: 2:16:43 PM - * To change this template use File | Settings | File Templates. 
- */ - -@Root // this is a root output object for simpleXML -public class SampleXmlMarshaller { - public enum RMDType { HapMap }; - - @Element // simpleXML tag to say make it an element, you could also specifiy Attribute - private RMDType type; - - @Element - private String fileVersion; - - @Element - private String fileName; - - public SampleXmlMarshaller() { - } - - public SampleXmlMarshaller( RMDType type, String fileName, String fileVersion ) { - setType( type ); - setFileName( fileName ); - setFileVersion( fileVersion ); - } - - public RMDType getType() { - return type; - } - - public void setType( RMDType type ) { - this.type = type; - } - - public String getFileVersion() { - return fileVersion; - } - - public void setFileVersion( String fileVersion ) { - this.fileVersion = fileVersion; - } - - public String getFileName() { - return fileName; - } - - public void setFileName( String fileName ) { - this.fileName = fileName; - } - - public String toString() { - return String.format("Type = %s, Name = %s, Version = %s%n", type, fileName, fileVersion ); - } - - public boolean equals(SampleXmlMarshaller other) { - if (other == null) { return false; } - if (other.getType() == this.getType() && - other.getFileVersion().equals(this.getFileVersion()) && - other.getFileName().equals(this.getFileName())) { - return true; - } - return false; - } - - public static void main( String argv[] ) { - if (argv.length != 1) { - System.err.println("You must specify a filename for the XML output."); - System.err.println("\nUsage: SampleXmlMarshaller outputfile\n"); - } - // our SampleXmlMarshallers - SampleXmlMarshaller startWith = new SampleXmlMarshaller(RMDType.HapMap, "testFile.xml", "0.1"); - SampleXmlMarshaller endWith = null; - - // where to read and write to - String writeTo = argv[0]; - - BasicConfigurator.configure(); - - marshal(startWith, writeTo); - endWith = unmarshal(writeTo); - - if (startWith.equals(endWith)) { - System.out.println("they're equal, check " + 
writeTo.toString() + " for the output"); - } else { - System.out.println("they're NOT equal, check " + writeTo.toString() + " for the output. Something must of gone wrong"); - } - - } - - public static void marshal(SampleXmlMarshaller sample, String filename) { - Serializer serializer = new Persister(new Format(new HyphenStyle())); - File out = new File(filename); - try { - serializer.write(sample, out); - } catch (Exception e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - } - } - - public static SampleXmlMarshaller unmarshal(String filename) { - Serializer serializer = new Persister(new Format(new HyphenStyle())); - File source = new File(filename); - try { - SampleXmlMarshaller example = serializer.read(SampleXmlMarshaller.class, source); - return example; - } catch (Exception e) { - throw new ReviewedStingException("Failed to marshal the data to file " + filename,e); - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java deleted file mode 100644 index 54645aef4..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.features.maf; - -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.Feature; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.io.IOException; -import java.util.*; -import java.lang.reflect.Field; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Jan 24, 2011 - * Time: 12:04:10 PM - * To change this template use File | Settings | File Templates. 
- */ -public class MafCodec implements FeatureCodec { - private final static Logger log = Logger.getLogger(MafCodec.class); - - private int expectedTokenCount = -1; - - - private Column BUILD_COL = new Column(new String[]{"NCBI_Build","build"},true); - private Column CHR_COL = new Column(new String[] {"Chromosome","chr"},true); - private Column START_COL = new Column(new String[] {"Start_position","start"},true); - private Column END_COL = new Column(new String[]{"End_position","end"},true); - private Column REF_ALLELE_COL = new Column(new String[] {"Reference_Allele","ref_allele"},true); - private Column TUMOR_ALLELE1_COL = new Column(new String[] {"Tumor_Seq_Allele1","tum_allele1"},true); - private Column TUMOR_ALLELE2_COL = new Column(new String[] {"Tumor_Seq_Allele2","tum_allele2"},true); - private Column TUMOR_SAMPLE_COL = new Column(new String[] {"Tumor_Sample_Barcode","tumor_barcode"},true); - private Column NORMAL_SAMPLE_COL = new Column(new String[]{"Matched_Norm_Sample_Barcode","normal_barcode"},true); - // optional fields (absent from maf lite): - private Column VARTYPE_COL = new Column(new String[]{"Variant_Type","classification"},false); - private Column STRAND_COL = new Column(new String[]{"Strand","strand"},false); - private Column HUGO_GENE_COL = new Column(new String[]{"Hugo_Symbol","gene"},false); - private Column VARCLASS_COL = new Column(new String[]{"Variant_Classification","type"},false); - - - public enum MAF_TYPE { - UNKNOWN,LITE, ANNOTATED - } - - private static String INS ="INS"; - private static String DEL ="DEL"; - private static String SNP ="SNP"; - private static String MNP ="MNP"; - - private MAF_TYPE mafType=MAF_TYPE.UNKNOWN; - - private List allColumns = null; /// filled dynamically by constructor through introspection. Slow but less typing. 
- - private boolean tooManyColsWarned = false; - private boolean tooFewColsWarned = false; - - public MafCodec() { - allColumns = new ArrayList(30); - Field[] fields = this.getClass().getDeclaredFields(); - try { - for ( Field f : fields ) { - if ( f.get(this) instanceof Column ) { - allColumns.add((Column)f.get(this)); - } - } - } catch (IllegalAccessException e) { - throw new StingException("Error in MAFCodec when trying to introspect itself, this is probably a BUG",e); - } - } - - - /** - * Decode a line to obtain just its FeatureLoc for indexing -- contig, start, and stop. - * This method will NOT fill in the additional information available in the maf file - * @param line the input line to decode - * @return Return the FeatureLoc encoded by the line, or null if the line does not represent a feature (e.g. is - * a comment) - */ - public Feature decodeLoc(String line) { - return reallyDecode(line,false); - } - - - /** - * Fully decode a line, will try extracting as much additional/annotation information from the maf file as it can. - * @param line the input line to decode - * @return Return the FeatureLoc encoded by the line, or null if the line does not represent a feature (e.g. is - * a comment) - */ - public Feature decode(String line) { - return reallyDecode(line,true); - } - - /** Decodes a maf line. 
If extra is false, will decode only location and return; - * if extra is true, then extracts everything it can (samples, annotations, etc) - * @param line - * @param extra - * @return - */ - public Feature reallyDecode(String line, boolean extra) { - - // ignore commented-out lines - if (line.startsWith("#")) return null; - - // split the line - String[] tokens = line.split("\\t",-1); - - if ( expectedTokenCount == -1 ) { // do this only when we receive the first line and do not know the number of columns yet - // we have not seen a single line yet, let's initialize the number of fields from the first line: - expectedTokenCount = tokens.length; - log.info("MAF: line has "+expectedTokenCount+" fields (columns)"); - if ( expectedTokenCount == 9 ) { - mafType = MAF_TYPE.LITE; - log.info("MAF file appears to be MAF Lite"); - } else { - if ( expectedTokenCount >= 63 ) { - mafType = MAF_TYPE.ANNOTATED; - log.info("MAF file appears to be MAF-Annotated"); - } else { - log.info("MAF file has "+expectedTokenCount +" columns in first line, unknown file type"); - } - } - if ( line.contains("Chromosome") && line.contains("Start") && line.contains("Build") || - line.contains("build") && line.contains("start") && line.contains("ref_allele") ) { - // a naive way to detect the line with column names - - setColumnsFromHeader(tokens); - log.info("MAF file contains header, all required columns found!"); - return null; - } else { - switch( mafType ) { - case UNKNOWN: throw new UserException.MalformedFile("Can not guess type of the MAF file from number of columns and there is no header"); - case LITE: setMafLiteCols(); break; - case ANNOTATED: setMafAnnotatedCols(); break; - } - log.info("MAF file has no header; assuming standard column order for the MAF type "+mafType); - } - } - - - if (tokens.length < expectedTokenCount) { - if ( ! 
tooFewColsWarned ) { - log.error("MAF line contains too few columns ("+tokens.length+"); this error is reported only once."); - tooFewColsWarned = true; - } - } - if (tokens.length > expectedTokenCount) { - if ( ! tooManyColsWarned ) { - log.warn("MAF line contains more columns than expected ("+tokens.length+"); extra columns discarded. This error is shown only once."); - tooManyColsWarned = true; - } - } - - if ( tokens[CHR_COL.getIndex()].equals("Chromosome") || tokens[CHR_COL.getIndex()].equals("chr")) return null; // if someone uses this codec manually and feeds it the header line multiple times... - // create a new feature from the line: - - int start = 0; - try { - start = Integer.parseInt(START_COL.getValue(tokens)); - } catch (NumberFormatException e) { - throw new UserException.MalformedFile("Missing or non-numeric start position in line:\n"+line,e); - } - int stop = 0 ; - try { - stop = Integer.parseInt(END_COL.getValue(tokens)); - } catch (NumberFormatException e) { - throw new UserException.MalformedFile("Missing or non-numeric stop position in line:\n"+line,e); - } - - String eventType="UNKNOWN"; - - String ref = REF_ALLELE_COL.getValue(tokens); - String alt1 = TUMOR_ALLELE1_COL.getValue(tokens); - String alt2 = TUMOR_ALLELE2_COL.getValue(tokens); - - if ( ref.equals("-") ) { - // insertion - eventType = INS; - stop-- ; // maf lists stop as first base after insertion, convert internally to vcf style - // perform some format validation: - - if ( alt1.equals("-") && alt2.equals("-") ) - throw new UserException.MalformedFile("Inconsistency in MAF: both alt alleles reported as ref ('-') for an insertion"); - - if ( ! alt1.equals("-") && ! alt2.equals("-") && ! 
alt1.equals(alt2) ) - throw new UserException.MalformedFile("Inconsistency in MAF: two different (non-ref) alt alleles reported for an insertion"); - - if ( stop != start ) - throw new UserException.MalformedFile("Inconsistency in MAF: end position for an insertion is not start+1"); - - } else { - if ( alt1.equals("-") || alt2.equals("-") ) { - // deletion - eventType = DEL; - start--; // maf lists start as the first deleted base; convert internally to vcf style - // perform some format validation: - - if ( ! alt1.equals("-") && ! alt1.equals(ref) ) - throw new UserException.MalformedFile("Inconsistency in MAF: non-deleted alt allele is not ref for a deletion"); - - if ( ! alt2.equals("-") && ! alt2.equals(ref) ) - throw new UserException.MalformedFile("Inconsistency in MAF: non-deleted alt allele is not ref for a deletion"); - - if ( (stop - start) != ref.length() ) - throw new UserException.MalformedFile("Inconsistency in MAF: deletion length is not end-start+1"); - - } else { - // no '-' alleles --> it's a snp/mnp - if ( ref.length() == 1 ) { - // it's a snp - eventType = SNP; - if ( stop != start ) - throw new UserException.MalformedFile("Inconsistency in MAF: start/end positions not equal for a SNP"); - } else { - // it's an mnp - eventType = MNP; - if ( (stop - start + 1) != ref.length() ) - throw new UserException.MalformedFile("Inconsistency in MAF: MNP length is not end-start+1"); - } - - if ( alt1.length() != ref.length() || alt2.length() != ref.length() ) - throw new UserException.MalformedFile("Inconsistency in MAF: lengths of ref and alt alleles for a SNP/MNP differ"); - if ( ! alt1.equals(ref) && ! alt2.equals(ref) && ! alt1.equals(alt2) ) - throw new UserException.MalformedFile("Inconsistency in MAF: two different non-ref alt alleles reported for a SNP/MNP"); - } - } - // if we got vartype column, make sure it makes sense: - if ( VARTYPE_COL.isSet(tokens) && ! 
tokens[VARTYPE_COL.getIndex()].equals(eventType) ) { - // special case: we annotate everything as MNP while MAF can have DNP/TNP, these are fine: - if ( eventType == MNP && ( - tokens[VARTYPE_COL.getIndex()].equals("DNP") && ref.length() == 2 || - tokens[VARTYPE_COL.getIndex()].equals("TNP") && ref.length() == 3) - ) {} // these are fine - else { - throw new UserException.MalformedFile("Inconsistency in MAF: variant looks like a "+eventType +" but annotated as "+ - tokens[VARTYPE_COL.getIndex()]); - } - } - MafFeature feature = new MafFeature(CHR_COL.getValue(tokens),start,stop); - - if ( ! extra ) return feature; // ignore additional fields unless we were explicitly asked to read those! - - feature.setVariantType(eventType); - feature.setRefAllele(ref); - feature.setObservedTumor(alt1,alt2); - feature.setTumorSample(TUMOR_SAMPLE_COL.getValue(tokens)); - feature.setNormalSample(NORMAL_SAMPLE_COL.getValue(tokens)); - - if ( HUGO_GENE_COL.isSet(tokens) ) feature.setHugoGeneSymbol(tokens[HUGO_GENE_COL.getIndex()]); - if ( VARCLASS_COL.isSet(tokens) ) feature.setVariantClassification(tokens[VARCLASS_COL.getIndex()]); - - return feature; - } - - public Class getFeatureType() { - return MafFeature.class; - } - - - /** Read and return the header, or null if there is no header. 
- * - * @return header object - */ - public Object readHeader(LineReader reader) { - return null; - } - - /** Set expected column indices for MafLite - * - */ - private void setMafLiteCols() { - BUILD_COL.setIndex(0); - CHR_COL.setIndex(1); - START_COL.setIndex(2); - END_COL.setIndex(3); - REF_ALLELE_COL.setIndex(4); - TUMOR_ALLELE1_COL.setIndex(5); - TUMOR_ALLELE2_COL.setIndex(6); - TUMOR_SAMPLE_COL.setIndex(7); - NORMAL_SAMPLE_COL.setIndex(8); - } - - private void setMafAnnotatedCols() { - BUILD_COL.setIndex(3); - CHR_COL.setIndex(4); - START_COL.setIndex(5); - END_COL.setIndex(6); - REF_ALLELE_COL.setIndex(10); - TUMOR_ALLELE1_COL.setIndex(11); - TUMOR_ALLELE2_COL.setIndex(12); - TUMOR_SAMPLE_COL.setIndex(15); - NORMAL_SAMPLE_COL.setIndex(16); - VARTYPE_COL.setIndex(9); - STRAND_COL.setIndex(7); - VARCLASS_COL.setIndex(8); - HUGO_GENE_COL.setIndex(0); - } - - private void setColumnsFromHeader(String[] tokens) { - Map colNames = new HashMap(); - for ( int i = 0 ; i < tokens.length ; i++ ) colNames.put(tokens[i],i); - - for ( Column c : allColumns ) c.setFromMap(colNames); - } - - -} - - -class Column { - int index ; - List names; - boolean required; - - Column(String name, boolean required) { - this.names = new ArrayList(); - this.names.add(name); - this.required = required; - this.index = -1; - } - - Column(String [] names, boolean required) { - this.names = new ArrayList(); - for ( int i = 0 ; i < names.length ; i++ ) this.names.add(names[i]); - this.required = required; - this.index = -1; - } - - public String getName() { return names.get(0); } - public Collection getNames() { return names; } - public void setName(String name) { - for ( String n : names ) { - if ( n.equals(name) ) return; - } - this.names.add( name ); - } - - public int getIndex() { return index; } - public void setIndex(int index) { this.index = index; } - public String getValue(String[] fields) { - if ( index < fields.length ) return fields[index]; - - if ( required ) throw new 
UserException.MalformedFile("In MAF file: required column "+getName()+" has index "+index+ - ", but only "+fields.length+ " fields are present in maf line"); - return null; - } - - /** Sets this column's index from the provided name->index map (i.e. searches for itself in the map). - * If column not found, throw_exception is true AND this column is required, then an exception will - * be thrown right away; otherwise returns quietely even if map does not contain this column. - * @param m - * @param throw_exception - */ - public void setFromMap(Map m, boolean throw_exception) { -// Integer i = null; - for ( String n : names ) { - if ( m.containsKey(n) ) { -// if ( i != null ) -// throw new UserException.MalformedFile("MAF file contains multiple columns with name or alternative names registered for single data field "+getName()); - // go with the first column name found; we assume here that column names have priorities: - // for instance, if the file has both 'Chromosome' and 'chr' columns, we will just take - // Chromosome and run with that - this.index = m.get(n); - return; - } - } - if ( this.required && throw_exception ) throw new UserException.MalformedFile("Required column "+getName()+" is missing from the maf file"); - this.index = -1; - } - -/** Sets this column's index from the provided name->index map (i.e. searches for itself in the map). - * If this column is required but not found in the map, then an exception will - * be thrown. 
- * @param m - */ - public void setFromMap(Map m) { - setFromMap(m,true); - } - - public boolean isSet() { return index > -1; } - - public boolean isSet(String[] fields) { return index > -1 && index < fields.length; } - -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java deleted file mode 100644 index 49d9713dd..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java +++ /dev/null @@ -1,394 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.features.maf; - -import org.broad.tribble.Feature; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Jan 24, 2011 - * Time: 12:01:12 PM - * To change this template use File | Settings | File Templates. - */ -public class MafFeature implements Feature { - private String contig; // our contig location - private int start; // our starting location, zero based - private int stop; // our stopping location - - private String refAllele = "."; // the reference allele - private String[] observedTumAlleles = null; // The sequences of the observed alleles in tumor - private String[] observedNormAlleles = null; // The sequences of the observed alleles in normal - private String tumorSampleId = null; - private String normalSampleId = null; - private String hugoSymbol = null; - private Classification classification = null; - - public enum Type { - UNKNOWN,SNP,MNP,INS,DEL - }; - - public enum Classification { - Unclassified, Intergenic,Intron,Noncoding_transcript,UTR3,UTR5,Flank5,Silent,Missense, Nonsense, Splice_site, miRNA, - Frameshift, Inframe, Stop_deletion, Promoter,De_novo_start, De_novo_start_out_of_frame - } - - private Type type = Type.UNKNOWN; - - /** - * create the dbSNP feature, given the following information: - * - * @param contig the contig rsID - * @param start the start position, one based - * @param stop the stop position, one based - */ - 
MafFeature(String contig, - int start, - int stop) { - this.contig = contig; - this.start = start; - this.stop = stop; - } - - public void setVariantType(String t) { - type=Type.valueOf(t); - } - - public void setObservedTumor(String[] obs) { - observedTumAlleles = obs; - } - - public void setObservedTumor(String allele1, String allele2) { - observedTumAlleles = new String[2]; - observedTumAlleles[0] = allele1; - observedTumAlleles[1] = allele2; - } - - public void setRefAllele(String ref) { - this.refAllele = ref; - } - - public void setTumorSample(String sampleId) { - this.tumorSampleId = sampleId; - } - - public void setNormalSample(String sampleId) { - this.normalSampleId = sampleId; - } - - public String getRefBases() { - return refAllele; - } - - public String getHugoGeneSymbol() { - return hugoSymbol; - } - - public void setHugoGeneSymbol(String genename) { - int pos = genename.indexOf('|'); - if ( pos < 0 ) { - hugoSymbol = genename; - } else { - hugoSymbol = genename.substring(0,pos); - } - } - - /** - * Returns list of alleles (represented as strings) observed in Tumor. Returned alleles - * could be redundant (e.g. if we have homozygous non-ref at ploidy 2+). - * @return - */ - public List getObservedTumorAlleleList() { - return Arrays.asList(observedTumAlleles); - } - - /** - * Returns list of alleles (represented as strings) observed in Tumor. Returned alleles - * could be redundant (e.g. if we have homozygous non-ref at ploidy 2+). - * @return - */ - public List getObservedNormalAlleleList() { - if ( observedNormAlleles == null ) { - // if we got no ref allele observations recorded in the maf, we assume its ref/ref (somatic event) - List l = new ArrayList(2); - l.add(refAllele); - l.add(refAllele); - return l; - } - else return Arrays.asList(observedTumAlleles); - } - - /** Returns a (non-redundant) list of all distinct alleles - * observed at the site, plus a reference allele (whether it - * was actually observed or not). 
The reference allele is always returned as first - * element of the list. - * @return - */ - public List getAllAlleleList() { - List l = new ArrayList(); - l.add(refAllele); - for ( String a : observedTumAlleles ) { - if ( l.contains(a) ) continue; - l.add(a); - } - if ( observedNormAlleles != null ) { - for ( String a : observedNormAlleles ) { - if ( l.contains(a) ) continue; // already have this allele - l.add(a); - } - } - return l; - } - - /** Returns a (non-redundant) list of all distinct non-reference alleles - * observed at the site - * @return - */ - public List getAllNonRefAlleleList() { - List l = new ArrayList(); - for ( String a : observedTumAlleles ) { - if ( l.contains(a) ) continue; // already have this allele - if ( a.equals(refAllele)) continue; // allele is ref, we do not need it - l.add(a); - } - if ( observedNormAlleles != null ) { - for ( String a : observedNormAlleles ) { - if ( l.contains(a) ) continue; // already have this allele - if ( a.equals(refAllele)) continue; // allele is ref, we do not need it - l.add(a); - } - } - return l; - } - - public String getTumorSampleId() { return tumorSampleId; } - public String getNormalSampleId() { return normalSampleId; } - - public boolean isRefAllele(String a) { return refAllele.equals(a); } - - public Type getType() { return type; } - - public int lengthOnRef() { - switch ( type ) { - case SNP: - case MNP: - case DEL: - return refAllele.length(); - case INS: - return 0; - default: - throw new StingException("Unrecognized event type in Maf record: "+type); - } - } - - public boolean isSomatic() { - if ( observedTumAlleles[0].equals(refAllele) && observedTumAlleles[1].equals(refAllele) ) return false; // tumor is ref - // we get here only if tumor is non-ref - if ( observedNormAlleles == null ) return true; // norm alleles are omitted from maf only if they are all ref - if ( observedNormAlleles[0].equals(refAllele) && observedNormAlleles[1].equals(refAllele) ) return true; - return false; - } - - 
public void setVariantClassification(String s) { - if ( s.equals("IGR") ) { classification = Classification.Intergenic ; return; } - if ( s.equals("Intron") ) { classification = Classification.Intron ; return; } - if ( s.equals("3'UTR") || s.equals("3'-UTR")) { classification = Classification.UTR3 ; return; } - if ( s.equals("5'UTR") || s.equals("5'-UTR")) { classification = Classification.UTR5 ; return; } - if ( s.equals("5'-Flank") ) { classification = Classification.Flank5 ; return; } - if ( s.equals("Silent") || s.equals("Synonymous")) { classification = Classification.Silent ; return; } - if ( s.equals("Non-coding_Transcript")) { classification = Classification.Noncoding_transcript; return; } - if ( s.equals("Missense") || s.equals("Missense_Mutation") ) { classification = Classification.Missense ; return; } - if ( s.equals("Nonsense_Mutation") || s.equals("Nonsense") ) { classification = Classification.Nonsense ; return; } - if ( s.equals("Splice_Site") ) { classification = Classification.Splice_site ; return; } - if ( s.equals("miRNA") ) { classification = Classification.miRNA ; return; } - if ( s.equals("Frame_Shift_Ins") ) { classification = Classification.Frameshift ; return; } - if ( s.equals("Frame_Shift_Del") ) { classification = Classification.Frameshift ; return; } - if ( s.equals("In_Frame_Ins") ) { classification = Classification.Inframe ; return; } - if ( s.equals("In_Frame_Del") ) { classification = Classification.Inframe ; return; } - if ( s.equals("Stop_Codon_Del") ) { classification = Classification.Stop_deletion ; return; } - if ( s.equals("Splice_Site_Del") ) { classification = Classification.Splice_site ; return; } - if ( s.equals("Splice_Site_Ins") ) { classification = Classification.Splice_site ; return; } - if ( s.equals("Splice_Site_SNP") ) { classification = Classification.Splice_site ; return; } - if ( s.equals("Promoter") ) { classification = Classification.Promoter ; return; } - if ( s.equals("De_novo_Start") ) { classification = 
Classification.De_novo_start ; return; } - if ( s.equals("De_novo_Start_OutOfFrame") ) { classification = Classification.De_novo_start_out_of_frame ; return; } - if ( s.equals("TX-REF-MISMATCH") ) { classification = Classification.Unclassified ; return; } - throw new UserException.MalformedFile("Unknown variant classification: " + s); - } - - public Classification getVariantClassification() { - return classification; - } - - /* - * the required getting and setter methods - */ - - public String getChr() { - return contig; - } - - public int getStart() { - return start; - } - - public int getEnd() { - return stop; - } - -} - -class MafAdaptor implements VariantContextAdaptors.VCAdaptor { - /** - * Converts Maf features to VariantContext. - * @return MafFeature. - */ - @Override - public Class getAdaptableFeatureType() { return MafFeature.class; } - - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a MafFeature - * @return a VariantContext object - */ -// VariantContext convert(String name, Object input) { -// return convert(name, input, null); -// } - - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a MafFeature - * @param ref the reference context - * @return a VariantContext object - */ - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - - if ( ref == null ) - throw new UnsupportedOperationException("Conversion from MAF to VariantContext requires a reference context, null received"); - - MafFeature maf = (MafFeature)input; - if ( ! Allele.acceptableAlleleBases(maf.getRefBases()) ) - return null; - - List alleles = new ArrayList(); - - Allele refAllele = Allele.create(maf.getRefBases(), true); - // add the reference allele: - alleles.add(refAllele); - - // add all of the alt alleles - for ( String alt : maf.getAllNonRefAlleleList() ) { - if ( ! 
Allele.acceptableAlleleBases(alt) ) { - //System.out.printf("Excluding dbsnp record %s%n", dbsnp); - return null; - } - alleles.add(Allele.create(alt, false)); - } - - // make a mapping from sample to genotype - - String normalSample = maf.getNormalSampleId(); - String tumorSample = maf.getTumorSampleId(); - -// String[] genotypeStrings = hapmap.getGenotypes(); - - Map genotypes = new HashMap(2); - - addGenotype(genotypes, normalSample, maf.getObservedNormalAlleleList(),maf.getRefBases()); - addGenotype(genotypes,tumorSample,maf.getObservedTumorAlleleList(),maf.getRefBases()); - - - HashMap attrs = new HashMap(10); - // fill attributes: - if ( maf.getHugoGeneSymbol() != null && ! maf.getHugoGeneSymbol().equals("Unknown")) - attrs.put("Gene",maf.getHugoGeneSymbol()); - - if ( maf.isSomatic() ) { - attrs.put(VCFConstants.SOMATIC_KEY,true); - attrs.put("SS","Somatic"); - } else { - attrs.put("SS","Germline"); - } - - if ( maf.getVariantClassification() != null ) { - switch(maf.getVariantClassification()) { - case Intergenic: attrs.put("VC","Genomic"); break; - case Intron: attrs.put("VC","Intron"); break; - case Noncoding_transcript: attrs.put("VC","Noncoding_transcript"); break; - case UTR3: attrs.put("VC","3'UTR"); break; - case UTR5: attrs.put("VC","5'UTR"); break; - case Flank5: attrs.put("VC","5'flank"); break; - case Promoter: attrs.put("VC","5'flank"); break; - case De_novo_start: attrs.put("VC","De_novo_start"); break; - case De_novo_start_out_of_frame: attrs.put("VC","De_novo_start_out_of_frame"); break; - case Silent: attrs.put("VC","Silent"); break; - case Missense: attrs.put("VC","Missense"); break; - case Nonsense: attrs.put("VC","Nonsense"); break; - case Splice_site: attrs.put("VC","Splice_site"); break; - case miRNA: attrs.put("VC","miRNA"); break; - case Frameshift: attrs.put("VC","Frameshift"); break; - case Inframe: attrs.put("VC","Inframe"); break; - case Stop_deletion: attrs.put("VC","Stop_codon_deletion"); - case Unclassified: 
attrs.put("VC","Unclassified"); - default: - } - } - - attrs.put("VT",maf.getType()); - -// attrs.put(VariantContext.ID_KEY, hapmap.getName()); - int end = maf.getEnd(); - VariantContext vc = new VariantContext(name, maf.getChr(), maf.getStart(), end, alleles, - genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs); - return vc; - } - - private void addGenotype(Map dest, String sampleId, List alleles, String refAllele) { - List myAlleles = new ArrayList(2); - - boolean success = true; - - for ( String a : alleles ) { - if ( a.isEmpty() || a.contains("N") || a.contains(".")) return; // bad allele found - myAlleles.add(Allele.create(a,refAllele.equals(a))); - } - dest.put(sampleId, new Genotype(sampleId,myAlleles)); - } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/GenotypeAndValidateWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/GenotypeAndValidateWalker.java deleted file mode 100755 index 437fed843..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/GenotypeAndValidateWalker.java +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.util.*; - -import static 
org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; - -/** - * Validates the calls on a ROD track using a BAM dataset. - * - * @author carneiro - * @since Mar 3, 2011 - * @help.summary Validates the calls on a ROD track using a BAM dataset. - */ - -@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="alleles",type=VariantContext.class)) -@Allows(value={DataSource.READS, DataSource.REFERENCE}) - -// Ugly fix because RodWalkers don't have access to reads -@By(DataSource.REFERENCE) -@Reference(window=@Window(start=-200,stop=200)) - - -public class GenotypeAndValidateWalker extends RodWalker implements TreeReducible { - - @Output(doc="Generate a VCF file with the variants considered by the walker, with a new annotation \"callStatus\" which will carry the value called in the validation VCF or BAM file", required=false) - protected VCFWriter vcfWriter = null; - - @Argument(fullName ="set_bam_truth", shortName ="bt", doc="Use the calls on the reads (bam file) as the truth dataset and validate the calls on the VCF", required=false) - private boolean bamIsTruth = false; - - @Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false) - private int mbq = -1; - - @Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false) - private double deletions = -1; - - @Argument(fullName="standard_min_confidence_threshold_for_calling", shortName="stand_call_conf", doc="the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls", required=false) - private double callConf = -1; - - @Argument(fullName="standard_min_confidence_threshold_for_emitting", shortName="stand_emit_conf", doc="the minimum phred-scaled Qscore threshold to emit low confidence calls", required=false) - private double emitConf = -1; - - @Argument(fullName="condition_on_depth", 
shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false) - private int minDepth = -1; - - @Argument(fullName ="sample", shortName ="sn", doc="Name of the sample to validate (in case your VCF/BAM has more than one sample)", required=false) - private String sample = ""; - - - - - private String compName = "alleles"; - private UnifiedGenotyperEngine snpEngine; - private UnifiedGenotyperEngine indelEngine; - - public static class CountedData { - private long nAltCalledAlt = 0L; - private long nAltCalledRef = 0L; - private long nRefCalledAlt = 0L; - private long nRefCalledRef = 0L; - private long nNotConfidentCalls = 0L; - private long nUncovered = 0L; - - /** - * Adds the values of other to this, returning this - * @param other the other object - */ - public void add(CountedData other) { - nAltCalledAlt += other.nAltCalledAlt; - nAltCalledRef += other.nAltCalledRef; - nRefCalledAlt += other.nRefCalledAlt; - nRefCalledRef += other.nRefCalledRef; - nUncovered += other.nUncovered; - nNotConfidentCalls += other.nNotConfidentCalls; - } - } - - - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - public void initialize() { - - List rodList = this.getToolkit().getRodDataSources(); - if ( rodList.size() != 1 ) - throw new UserException.BadInput("You should provide exactly one genotype VCF"); - if ( !rodList.get(0).getName().equals(compName)) - throw new UserException.BadInput("The ROD track has to be named \""+ compName +"\". 
Not " + rodList.get(0).getName()); - - - // Initialize VCF header - if (vcfWriter != null) { - Map header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), compName); - Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); - headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); - vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); - } - - // Filling in SNP calling arguments for UG - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; - if (!bamIsTruth) uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq; - if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions; - if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf; - if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; - - uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac); - - // Adding the INDEL calling arguments for UG - uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL; - indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac); - - // make sure we have callConf set to the threshold set by the UAC so we can use it later. 
- callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - - final CountedData counter = new CountedData(); - - // For some reason RodWalkers get map calls with null trackers - if( tracker == null ) - return counter; - - VariantContext vcComp = tracker.getVariantContext(ref, compName, null, context.getLocation(), false); - if( vcComp == null ) - return counter; - - //todo - not sure I want this, may be misleading to filter extended indel events. - if (isInsideExtendedIndel(vcComp, ref)) - return counter; - - // Do not operate on variants that are not covered to the optional minimum depth - if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) { - counter.nUncovered = 1L; - return counter; - } - - VariantCallContext call; - if ( vcComp.isSNP() ) - call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); - else if ( vcComp.isIndel() ) { - call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); - } - else { - logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles()); - return counter; - } - - - boolean writeVariant = true; - - if (bamIsTruth) { - if (call.confidentlyCalled) { - // If truth is a confident REF call - if (call.isVariant()) { - if (vcComp.isVariant()) - counter.nAltCalledAlt = 1L; // todo -- may wanna check if the alts called are the same? 
- else - counter.nAltCalledRef = 1L; - } - // If truth is a confident ALT call - else { - if (vcComp.isVariant()) - counter.nRefCalledAlt = 1L; - else - counter.nRefCalledRef = 1L; - } - } - else { - counter.nNotConfidentCalls = 1L; - writeVariant = false; - } - } - else { - if (!vcComp.hasAttribute("GV")) - throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart()); - - - - if (call.isCalledAlt(callConf)) { - if (vcComp.getAttribute("GV").equals("T")) - counter.nAltCalledAlt = 1L; - else - counter.nRefCalledAlt = 1L; - } - else if (call.isCalledRef(callConf)) { - if (vcComp.getAttribute("GV").equals("T")) - counter.nAltCalledRef = 1L; - else - counter.nRefCalledRef = 1L; - } - else { - counter.nNotConfidentCalls = 1L; - writeVariant = false; - } - } - - if (vcfWriter != null && writeVariant) { - if (!vcComp.hasAttribute("callStatus")) { - MutableVariantContext mvc = new MutableVariantContext(vcComp); - mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? 
"ALT" : "REF" ); - vcfWriter.add(mvc, ref.getBase()); - } - else - vcfWriter.add(vcComp, ref.getBase()); - } - return counter; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - public CountedData reduceInit() { - return new CountedData(); - } - - public CountedData treeReduce( final CountedData sum1, final CountedData sum2) { - sum2.add(sum1); - return sum2; - } - - public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) { - reduceSum.add(mapValue); - return reduceSum; - } - - public void onTraversalDone( CountedData reduceSum ) { - double ppv = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nRefCalledAlt)); - double npv = 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nAltCalledRef)); - double sensitivity = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nAltCalledRef)); - double specificity = (reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt > 0) ? 
100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt)) : 100; - logger.info(String.format("Resulting Truth Table Output\n\n" + - "---------------------------------------------------\n" + - "\t\t|\tALT\t|\tREF\t\n" + - "---------------------------------------------------\n" + - "called alt\t|\t%d\t|\t%d\n" + - "called ref\t|\t%d\t|\t%d\n" + - "---------------------------------------------------\n" + - "positive predictive value: %f%%\n" + - "negative predictive value: %f%%\n" + - "---------------------------------------------------\n" + - "sensitivity: %f%%\n" + - "specificity: %f%%\n" + - "---------------------------------------------------\n" + - "not confident: %d\n" + - "not covered: %d\n" + - "---------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered)); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/BaseLikelihoodsFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/BaseLikelihoodsFileReader.java deleted file mode 100644 index 06acd5789..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/BaseLikelihoodsFileReader.java +++ /dev/null @@ -1,97 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import java.io.*; -import java.util.ArrayList; -/** - * Reads the .baselikelihoods file for various walkers in the HLA caller suite - * @author shermanjia - */ -public class BaseLikelihoodsFileReader { - double[][] baseLikelihoods; - int[] positions; - ArrayList polymorphicSites = new ArrayList(); - - public Integer[] GetPolymorphicSites(){ - return polymorphicSites.toArray(new Integer[polymorphicSites.size()]); - } - - public double[][] GetBaseLikelihoods(){ - return baseLikelihoods; - } - - public int[] GetPositions(){ 
- return positions; - } - - public void ReadFile(String filename, boolean findPolymorphicSites){ - try{ - //System.out.printf("INFO Reading base likelihoods file ... "); - FileInputStream fstream = new FileInputStream(filename); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null, pos = null; - //Determine size of file - int n = 0; char ref; - while ((strLine = br.readLine()) != null) { - if (strLine.indexOf("INFO") == -1){n++;} - } - - baseLikelihoods = new double[n][10]; - positions = new int[n]; - double[] localLikelihoods = new double[10]; - - - //System.out.printf("%s lines of data found ... ",n); - in.close(); - //Read and store data - - fstream = new FileInputStream(filename); - in = new DataInputStream(fstream); - br = new BufferedReader(new InputStreamReader(in)); - n = 0; - while ((strLine = br.readLine()) != null) { - if (strLine.indexOf("INFO") == -1){ - s = strLine.split("\\t"); - pos = s[0].split(":"); - ref = s[1].charAt(0); - positions[n] = Integer.valueOf(pos[1]); - for (int i = 3; i <= 12; i++){ - baseLikelihoods[n][i-3] = Double.valueOf(s[i]); - localLikelihoods[i-3] = baseLikelihoods[n][i-3]; - } - if (findPolymorphicSites){ - if (IsPolymorphicSite(localLikelihoods,ref)){ - polymorphicSites.add(positions[n]); - } - } - n++; - } - } - - }catch (Exception e){//Catch exception if any - System.err.println("BaseLikelihoodsFileReader Error: " + e.getMessage()); - } - } - - private int IndexOf(char c){ - switch(c){ - case 'A': return 0; - case 'C': return 4; - case 'G': return 7; - case 'T': return 9; - default: return -1; - } - } - - private boolean IsPolymorphicSite(double[] likelihoods, char ref){ - boolean isPolymorphicSite = false; - double homreflikelihood = likelihoods[IndexOf(ref)]; - for (int i = 0; i < 10; i++){ - if (likelihoods[i] > homreflikelihood){ - isPolymorphicSite = true; - } - } - return isPolymorphicSite; - } -} - diff --git 
a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java deleted file mode 100644 index d8e8bea94..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.ArrayList; -import java.util.Hashtable; -import java.util.Enumeration; -import java.util.Vector; -import java.util.Collections; -import java.io.PrintStream; - -/** - * Calculates likelihood of observing the data given pairs of HLA alleles. NOTE: run CalculateBaseLikelihoods first! Usage: java -jar GenomeAnalysisTK.jar -T CalculateAlleleLikelihoods -I /humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods -eth\ -nicity Caucasian | grep -v "INFO" | grep -v "DONE!" 
> OUTPUT.allelelikelihoods - * @author shermanjia - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -public class CalculateAlleleLikelihoodsWalker extends ReadWalker { - @Output - public PrintStream out; - - @Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true) - public String baseLikelihoodsFile = ""; - - @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false) - public boolean DEBUG = false; - - @Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false) - public String debugAlleles = ""; - - @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false) - public boolean FREQUENT = false; - - @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false) - public String ethnicity = "CaucasianUSA"; - - String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq"; - String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq"; - String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq"; - String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles4Digit"; - String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.txt"; - String HLA2DigitFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY_2DIGIT.txt"; - - Hashtable AlleleFrequencies,UniqueAlleles,Alleles2Digit; - - CigarParser formatter = new CigarParser(); - double[][] baseLikelihoods; - int[] positions; - boolean loaded = false; - - String[] HLAnames, HLAreads, HLAnames2, HLAreads2; - Integer[] HLAstartpos, HLAstoppos, HLAstartpos2, HLAstoppos2; - ArrayList HLAnamesAL, HLAreadsAL, Loci, AllelesToSearch; - ArrayList HLAstartposAL, HLAstopposAL; - - public 
Integer reduceInit() { - if (!loaded){ - loaded = true; - BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader(); - baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, false); - baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods(); - positions = baseLikelihoodsReader.GetPositions(); - - HLAnamesAL = new ArrayList(); - HLAreadsAL = new ArrayList(); - HLAstartposAL = new ArrayList(); - HLAstopposAL = new ArrayList(); - - out.printf("INFO Reading HLA alleles ... "); - HLAFileReader HLADictionaryReader = new HLAFileReader(); - HLADictionaryReader.ReadFile(HLAdatabaseFile); - HLAreads = HLADictionaryReader.GetSequences(); - HLAnames = HLADictionaryReader.GetNames(); - HLAstartpos = HLADictionaryReader.GetStartPositions(); - HLAstoppos = HLADictionaryReader.GetStopPositions(); - - HLADictionaryReader = new HLAFileReader(); - HLADictionaryReader.ReadFile(HLA2DigitFile); - HLAreads2 = HLADictionaryReader.GetSequences(); - HLAnames2 = HLADictionaryReader.GetNames(); - HLAstartpos2 = HLADictionaryReader.GetStartPositions(); - HLAstoppos2 = HLADictionaryReader.GetStopPositions(); - out.printf("Done! 
%s HLA alleles loaded.\n",HLAreads.length); - - //out.printf("INFO Common alleles:\n"); - for (int i = 1; i < UniqueAlleles.size(); i++){ - //out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]); - } - //out.printf("INFO Reading HLA dictionary ..."); - - - } - return 0; - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - //HLAnamesAL.add(read.getReadName()); - //HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString())); - //HLAstartposAL.add(read.getAlignmentStart()); - //HLAstopposAL.add(read.getAlignmentEnd()); - //out.printf("INFO\t%s\t%s\t%s\t%s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),formatter.FormatRead(read.getCigarString(), read.getReadString())); - return 1; - } - - private int GenotypeIndex(char a, char b){ - switch(a){ - case 'A': - switch(b){ - case 'A': return 0; - case 'C': return 1; - case 'G': return 2; - case 'T': return 3; - }; - case 'C': - switch(b){ - case 'A': return 1; - case 'C': return 4; - case 'G': return 5; - case 'T': return 6; - }; - case 'G': - switch(b){ - case 'A': return 2; - case 'C': return 5; - case 'G': return 7; - case 'T': return 8; - }; - case 'T': - switch(b){ - case 'A': return 3; - case 'C': return 6; - case 'G': return 8; - case 'T': return 9; - }; - default: return -1; - } - } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - public void onTraversalDone(Integer numreads) { - //out.printf("Done! 
%s alleles found\n", numreads); - //HLAnames = HLAnamesAL.toArray(new String[numreads]); - //HLAreads = HLAreadsAL.toArray(new String[numreads]); - //HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]); - //HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]); - - double[][] AlleleLikelihoods = new double[numreads][numreads]; - - String name1, name2; - double frq1, frq2; - - - double minfrq = 0; - if (FREQUENT){ - minfrq = 0.0001; - } - int numcombinations = 0; - out.printf("NUM\tAllele1\tAllele2\tSSG\n"); - - //debugging specific alleles - int index1 = -1, index2 = -1; - if (!debugAlleles.equals("")){ - String s[] = debugAlleles.split(","); - for (int i = 0; i < numreads; i++){ - if (HLAnames[i].equals(s[0])){ - index1 = i; - } - if (HLAnames[i].equals(s[1])){ - index2 = i; - } - if (index1 > -1 && index2 > -1){ - out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2); - double dl = CalculateLikelihood(index1,index2,HLAreads2,true); - break; - } - } - } - - //Pre-process homozygous combinations to determine top possible alleles (for efficiency) - int numreads2 = HLAnames2.length; - Alleles2Digit = new Hashtable(); - Loci = new ArrayList(); - double[] AlleleLikelihoods2 = new double[numreads]; - for (int i = 0; i < numreads; i++){ - name1 = HLAnames[i].substring(4); - String [] n1 = name1.split("\\*"); - numcombinations++; - AlleleLikelihoods2[i] = CalculateLikelihood(i,i,HLAreads,false); - if (AlleleLikelihoods2[i] < 0){ - name2 = n1[0] + "*" + n1[1].substring(0, 2); - if (!Loci.contains(n1[0])){Loci.add(n1[0]);} - if (!Alleles2Digit.containsKey(name2)){ - Alleles2Digit.put(name2, AlleleLikelihoods2[i]); - }else if ((Double) Alleles2Digit.get(name2) < AlleleLikelihoods2[i]){ - Alleles2Digit.put(name2, AlleleLikelihoods2[i]); - } - } - } - - //Sort alleles at 2 digit resolution for each locus - AllelesToSearch = new ArrayList(); - for (int i = 0; i < Loci.size(); i++){ - Enumeration k = Alleles2Digit.keys(); - Hashtable AllelesAtLoci = 
new Hashtable(); - - //find alleles at the locus - while( k.hasMoreElements() ){ - name1 = k.nextElement().toString(); - String [] n1 = name1.split("\\*"); - if (Loci.get(i).equals(n1[0])){ - AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1); - } - } - - //Sort alleles at locus, mark top six 2-digit classes for deep search - int num = 1; - Vector v = new Vector(AllelesAtLoci.keySet()); - Collections.sort(v); - for (Enumeration e = v.elements(); e.hasMoreElements();) { - Double key = Double.valueOf(e.nextElement().toString()); - String allele = AllelesAtLoci.get(key).toString(); - if (num <= 10){ - AllelesToSearch.add(allele); - - num++; - } - //out.printf("%s\t%s\n",allele,key); - } - } - - //Iterate through allele pairs to calculate likelihoods - if (true){ - numcombinations = 0; - for (int i = 0; i < numreads; i++){ - name1 = HLAnames[i].substring(4); - String [] n1 = name1.split("\\*"); - if (AllelesToSearch.contains(n1[0] + "*" + n1[1].substring(0, 2))){ - //out.printf("1: %s\n",name1); - //frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString()); - //if (frq1 > minfrq){ - for (int j = i; j < numreads; j++){ - name2 = HLAnames[j].substring(4); - String [] n2 = name2.split("\\*"); - if (AllelesToSearch.contains(n2[0] + "*" + n2[1].substring(0, 2))){ - if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){ - numcombinations++; - AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,HLAreads,false); - if (AlleleLikelihoods[i][j] < 0){ - out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]); - } - } - } - } - } - } - } - } - - private double CalculateLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){ - //Calculates likelihood for specific allele pair - String read1 = HLAalleles[a1]; - String read2 = HLAalleles[a2]; - int start1 = HLAstartpos[a1]; - int start2 = HLAstartpos[a2]; - int stop1 = HLAstoppos[a1]; - int stop2 = HLAstoppos[a2]; - double likelihood = 0; - int pos, 
index; - char c1, c2; - - - for (int i = 0; i < positions.length; i++){ - pos = positions[i]; - if (pos < stop1 && pos > start1 && pos < stop2 && pos > start2){ - index = GenotypeIndex(read1.charAt(pos-start1),read2.charAt(pos-start2)); - if (index > -1){ - likelihood = likelihood + baseLikelihoods[i][index]; - if (debug){ - c1 = read1.charAt(pos-start1); - c2 = read2.charAt(pos-start2); - out.printf("INFO: DEBUG %s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",HLAnames[a1],HLAnames[a2],pos,c1,c2,index,likelihood); - } - } - } - } - return likelihood; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java deleted file mode 100644 index daae2fbd5..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.pileup.FragmentPileup; -import org.broadinstitute.sting.utils.pileup.PileupElement; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.io.PrintStream; - -/** - * Calculates the probability of observing data for each genotype at every position. NOTE: run FindClosestAllele first to create .filter file. 
Usage: java -jar GenomeAnalysisTK.jar -T CalculateBaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval [-filter INPUT.filter -minAllowedMismatches 7] | grep -v "INFO" | grep -v "MISALIGNED" > OUTPUT.baselikelihoods - * @author shermanjia - */ -public class CalculateBaseLikelihoodsWalker extends LocusWalker>{ - @Output - public PrintStream out; - - @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false) - public boolean DEBUG = false; - @Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false) - public String inputAlleles = ""; - @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false) - public String filterFile = ""; - @Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false) - public int MAXALLOWEDMISMATCHES = 6; - @Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false) - public int MINREQUIREDMATCHES = 0; - - int[][] LOD, LikelihoodScores; - ArrayList ReadsToDiscard = new ArrayList(); - - ArrayList AllReads = new ArrayList(); - ArrayList AllReadNames = new ArrayList(); - - boolean dataLoaded = false; - - - //Loads reads to filter - public Pair reduceInit() { - if (!dataLoaded){ - dataLoaded = true; - - if (!filterFile.equals("")){ - out.printf("INFO Reading properties file ... "); - SimilarityFileReader similarityReader = new SimilarityFileReader(); - similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES); - ReadsToDiscard = similarityReader.GetReadsToDiscard(); - out.printf("Done! 
Found %s misaligned reads to discard.\n",ReadsToDiscard.size()); - for (int i = 0; i < ReadsToDiscard.size(); i++){ - out.printf("MISALIGNED %s\n", ReadsToDiscard.get(i).toString()); - } - } - } - return new Pair(0l,0l); - } - - - - private void InitializeVariables(int n){ - LOD = new int[n][n]; - LikelihoodScores = new int[n][n]; - for (int i = 0; i < n; i++){ - - for (int j = 0; j 0 ) { - - int numAs = 0, numCs = 0, numGs = 0, numTs = 0; - //if (DEBUG){ - out.printf("%s\t%s\t", context.getLocation(),(char)ref.getBase()); - //} - - //Calculate posterior probabilities - DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods(); - - - for ( PileupElement p : context.getBasePileup() ) { - byte base = p.getBase(); - if (!ReadsToDiscard.contains(p.getRead().getReadName()) && BaseUtils.simpleBaseToBaseIndex(base) != -1) { - G.add(p, true, false, 0); - //if (DEBUG){ - if (base == 'A'){numAs++;} - else if (base == 'C'){numCs++;} - else if (base == 'T'){numTs++;} - else if (base == 'G'){numGs++;} - //} - - } - } - //if (DEBUG) { - out.printf("A[%s]C[%s]T[%s]G[%s]",numAs,numCs,numTs,numGs); - for ( DiploidGenotype g : DiploidGenotype.values() ) { - out.printf("\t%.2f",G.getLikelihood(g)); - } - out.printf("\n"); - //} - - } - return context.size(); - } - - public Pair reduce(Integer value, Pair sum) { - long left = value.longValue() + sum.getFirst(); - long right = sum.getSecond() + 1l; - return new Pair(left, right); - } - - public void onTraversalDone(Pair result) { - - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java deleted file mode 100644 index 2efb0167e..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java +++ /dev/null @@ -1,922 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining 
a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.pileup.FragmentPileup; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.io.*; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Hashtable; -import java.util.List; - -/** - * Original Call HLA walker (older). 
Look here for inspiration, but not for the most recent tools - * @author shermanjia - */ -public class CallHLAWalker extends LocusWalker>{ - @Output - public PrintStream out; - - @Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false) - public boolean suppressPrinting = false; - - @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false) - public boolean DEBUG = false; - - @Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false) - public String inputAlleles = ""; - - @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false) - public String ethnicity = "Caucasian"; - - @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false) - public String filterFile = ""; - - String al1 = "", al2 = "", al3 = "", al4 = ""; - - - //String HLAdatabaseFile = "/Users/shermanjia/Work/HLA.sam"; - //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.4digitUnique.sam"; - String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam"; - - //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam"; - //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.4digitUnique.sam"; - //String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/CEU_Founders_HLA.freq"; - - String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq"; - String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq"; - String AlleleFrequencyFile; - - - ArrayList HLAreads = new ArrayList(); - ArrayList HLAcigars = new ArrayList(); - ArrayList HLAnames = new ArrayList(); - ArrayList HLApositions = new ArrayList(); - ArrayList ReadsToFilter = new ArrayList(); - - ArrayList AllReads = new 
ArrayList(); - ArrayList AllReadNames = new ArrayList(); - - int[] HLAstartpos; - int[] HLAstoppos; - int numHLAlleles = 0; - int numInterval = 1; - double[] SingleAlleleFrequencies; - double[][] LOD; String[][] Alleles; - double[][] LikelihoodScores; - double[][] PhasingScores; - double[][]PhasingProbabilities; - double[][] CombinedAlleleFrequencies; - int j1, k1, j2, k2; - int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; - double likelihoodsumA = 0.0, likelihoodsumB = 0.0, likelihoodsumC = 0.0; - double inverseMaxProbA = 0.0, inverseMaxProbB = 0.0, inverseMaxProbC = 0.0; - - Hashtable AlleleFrequencies = new Hashtable(); - - Hashtable Scores = new Hashtable(); - Hashtable SNPs = new Hashtable(); - ArrayList SNPchars = new ArrayList(); - ArrayList SNPlocations = new ArrayList(); - Integer SNPcount = 0; - int[][] SNPcorrelation; - double[][] SNPcorrelationProb; - String[][] SNPhaplotypes; - - boolean DatabaseLoaded = false; - boolean PrintedOutput = false; - - public Pair reduceInit() { - - if (!DatabaseLoaded){ - try{ - //Load sequences corresponding to HLA alleles from sam file - if (!inputAlleles.equals("")){ - String[] str = inputAlleles.split(","); - al1 = str[0]; - al2 = str[1]; - al3 = str[2]; - al4 = str[3]; - } - - //set ethnic group to look up allele frequencies - if (ethnicity.equals("Black")){ - AlleleFrequencyFile = BlackAlleleFrequencyFile; - }else{ - AlleleFrequencyFile = CaucasianAlleleFrequencyFile; - } - - out.printf("Reading HLA database ..."); - FileInputStream fstream = new FileInputStream(HLAdatabaseFile); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; - //Read File Line By Line - int i = 0; - while ((strLine = br.readLine()) != null) { - s = strLine.split("\\t"); - if (s.length>=10){ - //Parse the reads with cigar parser - HLAreads.add(CigarFormatted(s[5],s[9])); - HLAcigars.add(s[5]); - 
HLAnames.add(s[0]); - HLApositions.add(s[3]); - if (s[0].indexOf("HLA_A") > -1){ - if (iAstart < 0){iAstart=i;} - iAstop = i; i++; - }else if (s[0].indexOf("HLA_B") > -1){ - if (iBstart < 0){iBstart=i;} - iBstop = i; i++; - }else if (s[0].indexOf("HLA_C") > -1){ - if (iCstart < 0){iCstart=i;} - iCstop = i; i++; - } - } - } - in.close(); - int n = HLApositions.size(); numHLAlleles = n; - HLAstartpos = new int[n]; HLAstoppos = new int[n]; - SingleAlleleFrequencies = new double[n]; - LOD = new double[n][n]; - LikelihoodScores = new double[n][n]; - PhasingScores = new double[n][n]; - PhasingProbabilities = new double[n][n]; - CombinedAlleleFrequencies = new double[n][n]; - - for (i = 0; i < n; i++){ - //Find start and stop positions for each allele - HLAstartpos[i]=Integer.parseInt(HLApositions.get(i)); - HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1; - SingleAlleleFrequencies[i]=0.0; - //Initialize matrix of probabilities / likelihoods - for (int j = 0; j 10){ - ReadsToFilter.add(s[0]); - } - count++; - } - in.close(); - out.printf("Done! 
%s reads to exclude\n",count); - }catch (Exception e){//Catch exception if any - System.err.println("CallHLAWalker Error: " + e.getMessage()); - } - } - DatabaseLoaded = true; - out.printf("Comparing reads to database ...\n"); - - //For debugging: prints which HLA alleles were indexed before - if (j1 > k1){int tmp = k1; k1 = j1; j1 = tmp;} - if (j2 > k2){int tmp = k2; k2 = j2; j2 = tmp;} - - if (DEBUG){ - out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles); - out.printf("%s,%s\t%s,%s\n",HLAnames.get(j1),HLAnames.get(k1),HLAnames.get(j2),HLAnames.get(k2)); - } - } - out.printf("Computing for interval %s...\n",numInterval); - numInterval++; - return new Pair(0l,0l); - } - - - - private String CigarFormatted(String cigar, String read){ - // returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur - String formattedRead = ""; char c; String count; - int cigarPlaceholder = 0; int subcigarLength = 0; - int readPlaceholder = 0; int subreadLength = 0; - - //reads cigar string - for (int i = 0; i < cigar.length(); i++){ - c = cigar.charAt(i); - if (c == 'M'){ - //If reach M for match/mismatch, get number immediately preceeding 'M' and tack on that many characters to sequence - subcigarLength = i-cigarPlaceholder; - count = cigar.substring(cigarPlaceholder, i); - - subreadLength = Integer.parseInt(count); - formattedRead = formattedRead + read.substring(readPlaceholder, readPlaceholder+subreadLength); - - //increment placeholders - cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + subreadLength; - } else if (c == 'I'){ - //***NOTE: To be modified later if needed (insertions removed here)*** - - //If reaches I for insertion, get number before 'I' and skip that many characters in sequence - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //increment placeholders without adding 
inserted bases to sequence (effectively removes insertion). - cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + subreadLength; - } else if (c == 'H' || c == 'S'){ - //(H = Headers or S = Soft clipped removed here)*** - - //If reaches H for insertion, get number before 'H' and skip that many characters in sequence - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion). - cigarPlaceholder = i+1; - } else if (c == 'D'){ - //If reaches D for deletion, insert 'D' into sequence as placeholder - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //Add one 'D' for each deleted base - String deletion = ""; - for (int j = 1; j <= subreadLength; j++){ - deletion = deletion + "D"; - } - - //update placeholders - formattedRead = formattedRead + deletion; - cigarPlaceholder = i+1; - } - - } - return formattedRead; - } - - private static int unsignedByteToInt(byte b) { - //converts base quality from byte to int (not really needed) - return (int) b & 0xFF; - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - List reads = context.getReads(); - List offsets = context.getOffsets(); - GenomeLoc Gloc = context.getLocation(); - - //Create pileup of reads at this locus - ReadBackedPileup pileup = context.getPileup(); - - long loc = context.getPosition(); - if( context.size() > 0 ) { - //out.printf("RG for first read: %s%n",context.getReads().get(0).getReadName()); - int numAs = 0, numCs = 0, numGs = 0, numTs = 0,depth = 0; - String c1 = "", c2 = ""; - long pos_k = 0, pos_j = 0; - - //Debugging purposes: print location, reference base, pileup, and count (before quality filtering) - if (DEBUG){ - out.printf("%s\t", context.getLocation()); - out.printf("ref=%s\t", ref.getBase()); - } - - //Calculate posterior probabilities - 
DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods(); - - for ( PileupElement p : context.getBasePileup() ) { - byte base = p.getBase(); - int mapquality = p.getMappingQual(); - if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) { - String readname = p.getRead().getReadName(); - if (ReadsToFilter.contains(readname)){ - if (DEBUG){ - out.printf("\n%s %s %s %s\n",readname,p.getRead().getAlignmentStart(),p.getRead().getAlignmentEnd(),base); - } - }else{ - //consider base in likelihood calculations if it looks good and has high mapping score - G.add(p, true, false, 0); - if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(p.getRead());} - if (base == 'A'){numAs++; depth++;} - else if (base == 'C'){numCs++; depth++;} - else if (base == 'T'){numTs++; depth++;} - else if (base == 'G'){numGs++; depth++;} - } - } - } - - //Debugging purposes - if (DEBUG) {out.printf("A[%s]C[%s]T[%s]G[%s]\t",numAs,numCs,numTs,numGs);} - - if (depth > 0){ - //Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype - Scores = new Hashtable(); - Double likelihood = 0.0; double maxlikelihood = 0.0; - for ( DiploidGenotype g : DiploidGenotype.values() ) { - likelihood = G.getLikelihood(g); - if (maxlikelihood == 0.0 || likelihood > maxlikelihood){maxlikelihood = likelihood;} - Scores.put(g.toString(), likelihood); - //also hash other combination not stored by DiploidGenotype - if (g.toString().equals("AC")) { - Scores.put("CA", likelihood); - } else if (g.toString().equals("AG")){ - Scores.put("GA", likelihood); - } else if (g.toString().equals("AT")){ - Scores.put("TA", likelihood); - } else if (g.toString().equals("CG")){ - Scores.put("GC", likelihood); - } else if (g.toString().equals("CT")){ - Scores.put("TC", likelihood); - } else if (g.toString().equals("GT")){ - Scores.put("TG", likelihood); - } - } - - //Get likelihood score for homozygous ref: used to normalize likelihoood scores 
at 0. - String homref = String.valueOf(ref.getBaseAsChar())+String.valueOf(ref.getBaseAsChar()); - Double homreflikelihood = Double.parseDouble((String) Scores.get(homref).toString()); - - //Add SNP if it is a SNP and hasn't been added before - for ( DiploidGenotype g : DiploidGenotype.values() ) { - likelihood = G.getLikelihood(g); - if ((likelihood > homreflikelihood) && (likelihood == maxlikelihood) && (!SNPs.containsKey(Long.toString(loc)))){ - SNPcount++; - SNPs.put(Long.toString(loc),SNPcount); - SNPlocations.add(Integer.valueOf(Long.toString(loc))); - SNPchars.add(g.toString()); - } - } - - //Update likelihood for each combinations of alleles - String r1 = "", r2 = "", s1 = "", s2 = ""; - for (int j = 0; j < numHLAlleles; j++){ - //check if allele 1 overlaps current position - if (loc >= HLAstartpos[j] && loc <= HLAstoppos[j]){ - pos_j = loc - HLAstartpos[j]; - c1 = Character.toString(Character.toUpperCase(HLAreads.get(j).charAt((int) pos_j))); - - //Extract bases for HLA alleles indicated in reduceInit (for debugging) - if (j == j1) r1 = c1; - if (j == k1) r2 = c1; - if (j == j2) s1 = c1; - if (j == k2) s2 = c1; - - //Only check HLA A-A, B-B, C-C combinations - int kStart = 0, kStop = 0; - if (j >= iAstart && j <= iAstop){ - kStart = iAstart; kStop = iAstop; - } else if (j >= iBstart && j <= iBstop){ - kStart = iBstart; kStop = iBstop; - } else if (j >= iCstart && j <= iCstop){ - kStart = iCstart; kStop = iCstop; - } - - //Fill half-matrix only to speed up process - if (j > kStart){kStart = j;} - - if (DEBUG){ - //out.printf("j[%s],k[%s,%s]\t",j,kStart,kStop); - } - - //Update likelihoods - for (int k = kStart; k <= kStop; k++){ - - //check if allele 2 overlaps current position - if (loc >= HLAstartpos[k] && loc <= HLAstoppos[k]){ - pos_k = loc - HLAstartpos[k]; - c2 = Character.toString(Character.toUpperCase(HLAreads.get(k).charAt((int) pos_k))); - - //updates likelihoods for both permutations of the alleles, normalized to the likelihood for homozygous 
reference - if (Scores.containsKey(c1 + c2)){ - //out.printf("j[%s],k[%s],g[%s],s[%.0f]\t",j,k,c1+c2,Scores.get(c1 + c2)); - if (!homref.equals(c1+c2) || Double.parseDouble((String) Scores.get(homref).toString()) != 0){ - likelihood = Double.parseDouble((String) Scores.get(c1 + c2).toString()); - LikelihoodScores[j][k] = LikelihoodScores[j][k] + likelihood; - LOD[j][k]= LOD[j][k] + likelihood - homreflikelihood; - } - } else{ - if (DEBUG){ - //out.printf("\nCharacters [%s] not found,j[%s],k[%s],%s,%s\n",c1+c2,j,k,HLAnames.get(j),HLAnames.get(k)); - } - } - - } - } - } - } - if ( DEBUG ){ - //Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes - out.printf("Likelihoods %s%s[%5.1f] %s%s[%5.1f]\t",r1,r2,LikelihoodScores[j1][k1],s1,s2,LikelihoodScores[j2][k2]); - for ( DiploidGenotype g : DiploidGenotype.values() ) { - out.printf("%s[%5.1f] ",g.toString(),Scores.get(g.toString())); - } - out.printf("\n"); - } - } - } - return context.getReads().size(); - } - - - private void UpdateCorrelation(SAMRecord read, boolean PRINT){ - //Updates correlation table with SNPs from specific read (for phasing) - String s = CigarFormatted(read.getCigarString(), read.getReadString()); - ArrayList SNPsInRead = new ArrayList(); - ArrayList readindex = new ArrayList(); - Hashtable indexer = new Hashtable(); - - indexer.put('A', (Integer) 0); - indexer.put('C', (Integer) 1); - indexer.put('G', (Integer) 2); - indexer.put('T', (Integer) 3); - indexer.put('D', (Integer) 4); // D for deletion - - - int readstart = read.getAlignmentStart(); - - //Find all SNPs in read - for (int i = read.getAlignmentStart(); i <= read.getAlignmentEnd(); i++){ - if (SNPs.containsKey(String.valueOf(i))){ - //Stores matrix index - SNPsInRead.add((Integer) SNPs.get(String.valueOf(i))); - - //stores position along read - readindex.add((Integer) i - readstart); - } - } - - //Update correlation table; for each combination of SNP positions - for (int i 
= 0; i < SNPsInRead.size(); i++){ - for (int j = i+1; j < SNPsInRead.size(); j ++){ - char c1 = s.charAt((int) readindex.get(i)); - char c2 = s.charAt((int) readindex.get(j)); - - if (indexer.get(c1) != null && indexer.get(c2) != null){ - - int a = (SNPsInRead.get(i)-1)*5 + (Integer) indexer.get(c1); - int b = (SNPsInRead.get(j)-1)*5 + (Integer) indexer.get(c2); - if (PRINT){ - out.printf("ReadIndex[%s,%s] of %s\t", readindex.get(i),readindex.get(j),read.getAlignmentEnd()-readstart); - out.printf("SNP#[%s,%s] of %s\tPOS:%s[%s]\t%s[%s]\t", SNPsInRead.get(i),SNPsInRead.get(j), SNPs.size(), readindex.get(i)+readstart,c1, readindex.get(j)+readstart,c2); - out.printf("MatrixIndex[%s,%s] of %s",a,b,SNPcorrelation.length); - out.printf("\tc1=%s,c2=%s",c1,c2); - out.printf("\ta=%s,b=%s",a,b); - out.printf("\tSNPcorrelation[a][b]=%s\n",SNPcorrelation[a][b]); - } - SNPcorrelation[a][b]+=1; - } - } - } - } - - - private int GetPhaseScore(int alleleindex){ - int score = 0; - ArrayList SNPsInRead = new ArrayList(); - ArrayList readindex = new ArrayList(); - Hashtable indexer = new Hashtable(); - indexer.put('A', (Integer) 0); - indexer.put('C', (Integer) 1); - indexer.put('G', (Integer) 2); - indexer.put('T', (Integer) 3); - indexer.put('D', (Integer) 4); // D for deletion - - //Get HLA allele sequence and position given index - String allele = HLAreads.get(alleleindex); - int allelestart = HLAstartpos[alleleindex], allelestop = HLAstoppos[alleleindex]; - - //Finds SNPs in allele - for (int i = allelestart; i <= allelestop; i++){ - if (SNPs.containsKey(String.valueOf(i))){ - //Stores matrix index - SNPsInRead.add((Integer) SNPs.get(String.valueOf(i))); - //stores position along read - readindex.add((Integer) i - allelestart); - } - } - - //sum score for every pair of SNPs in the allele - for (int i = 0; i < SNPsInRead.size(); i++){ - for (int j = i+1; j < SNPsInRead.size(); j ++){ - char c1 = allele.charAt((int) readindex.get(i)); - char c2 = allele.charAt((int) 
readindex.get(j)); - if (indexer.get(c1) != null && indexer.get(c2) != null){ - int a = (SNPsInRead.get(i)-1)*5 + (Integer) indexer.get(c1); - int b = (SNPsInRead.get(j)-1)*5 + (Integer) indexer.get(c2); - score += SNPcorrelation[a][b]; - } - } - } - return score; - } - - private double GetPhaseProbability(int alleleindex){ - double prob = 1; - ArrayList SNPsInRead = new ArrayList(); - ArrayList readindex = new ArrayList(); - ArrayList Genotypes = new ArrayList(); - Hashtable indexer = new Hashtable(); - indexer.put('A', (Integer) 0); - indexer.put('C', (Integer) 1); - indexer.put('G', (Integer) 2); - indexer.put('T', (Integer) 3); - //indexer.put('D', (Integer) 4); // D for deletion - - //Get HLA allele sequence and position given index - String allele = HLAreads.get(alleleindex); - int allelestart = HLAstartpos[alleleindex], allelestop = HLAstoppos[alleleindex]; - - //Finds SNPs in allele - for (int i = allelestart; i <= allelestop; i++){ - if (SNPs.containsKey(String.valueOf(i))){ - //Stores matrix index - SNPsInRead.add((Integer) SNPs.get(String.valueOf(i))); - //stores position along read - readindex.add((Integer) i - allelestart); - //stores genotypes at SNPs - Genotypes.add(SNPchars.get((Integer) SNPs.get(String.valueOf(i))-1)); - } - } - - char c1, c2, gi1, gi2, gj1, gj2; - int a, b, a1, a2, b1, b2; - double numerator, denominator; - //sum score for every pair of SNPs in the allele - for (int i = 0; i < SNPsInRead.size()-1; i++){ - int j = i + 1; - c1 = allele.charAt((int) readindex.get(i)); - c2 = allele.charAt((int) readindex.get(j)); - - gi1 = Genotypes.get(i).toCharArray()[0]; - gi2 = Genotypes.get(i).toCharArray()[1]; - gj1 = Genotypes.get(j).toCharArray()[0]; - gj2 = Genotypes.get(j).toCharArray()[1]; - - numerator = 0; denominator = 0; - if (indexer.get(c1) != null && indexer.get(c2) != null){ - a = (SNPsInRead.get(i)-1)*5; - b = (SNPsInRead.get(j)-1)*5; - for (int k = 0; k < 5; k++){ - for (int l = 0; l < 5; l++){ - if 
(DEBUG){}//out.printf("[%s,%s]=%s, sum=%s\n",a+k,b+l,SNPcorrelation[a+k][b+l],denominator);} - denominator = denominator + SNPcorrelation[a+k][b+l]; - } - } - if (denominator > 0){ - //indicies for genotypes at the 2 SNPs - a1 = (SNPsInRead.get(i)-1)*5 + (Integer) indexer.get(gi1); - b1 = (SNPsInRead.get(j)-1)*5 + (Integer) indexer.get(gj1); - a2 = (SNPsInRead.get(i)-1)*5 + (Integer) indexer.get(gi2); - b2 = (SNPsInRead.get(j)-1)*5 + (Integer) indexer.get(gj2); - - if (gi1 == gi2 && gj1 == gj2){ - if (c1 == gi1 && c2 == gj1){ - numerator = SNPcorrelation[a1][b1]; - } - } else if ((c1 == gi1 && c2 == gj1) || (c1 == gi2 && c2 == gj2)){ - numerator = SNPcorrelation[a1][b1] + SNPcorrelation[a2][b2]; - } else if((c1 == gi2 && c2 == gj1) || (c1 == gi1 && c2 == gj2)){ - numerator = SNPcorrelation[a2][b1] + SNPcorrelation[a1][b2]; - } else { - if ((SNPcorrelation[a1][b1] + SNPcorrelation[a2][b2]) > (SNPcorrelation[a2][b1] + SNPcorrelation[a1][b2])){ - numerator = denominator - (SNPcorrelation[a1][b1] + SNPcorrelation[a2][b2]); - }else{ - numerator = denominator - (SNPcorrelation[a2][b1] + SNPcorrelation[a1][b2]); - } - } - - if (numerator == 0){ - prob = 0.01; - }else{ - prob = prob * numerator/denominator; - } - - if (DEBUG){ - out.printf("%s: %s,%s\tC1,C2=[%s,%s]\t[%s%s]=%s\t[%s%s]=%s\t[%s%s]=%s\t[%s%s]=%s\t%s/%s=%.3f\tprob=%.3f\n",readindex.get(i)+allelestart,readindex.get(j)+allelestart,alleleindex,c1,c2,gi1,gj1,SNPcorrelation[a1][b1],gi1,gj2,SNPcorrelation[a1][b2],gi2,gj1,SNPcorrelation[a2][b1],gi2,gj2,SNPcorrelation[a2][b2],numerator,denominator,numerator/denominator,prob); - } - } - } - } - return prob; - } - - public Pair reduce(Integer value, Pair sum) { - long left = value.longValue() + sum.getFirst(); - long right = sum.getSecond() + 1l; - return new Pair(left, right); - } - - public void onTraversalDone(Pair result) { - //Print HLA allele combinations with highest likelihood sums - if (!PrintedOutput){ - out.print("\nDone calculating likelihoods\n"); - - 
ArrayList TopAlleles = new ArrayList(); - - double maxA = 0; int i_maxA =0; int j_maxA = 0; double maxAphase = 0.0; double maxAfreq = 0.0; double maxlikelihoodA = 0.0; double maxProbA = 0.0; - double maxA2 = 0; int i_maxA_2 =0; int j_maxA_2 = 0; - - double maxB = 0; int i_maxB =0; int j_maxB = 0; double maxBphase = 0.0; double maxBfreq = 0.0; double maxlikelihoodB = 0.0; double maxProbB = 0.0; - double maxB2 = 0; int i_maxB_2 =0; int j_maxB_2 = 0; - - double maxC = 0; int i_maxC =0; int j_maxC = 0; double maxCphase = 0.0; double maxCfreq = 0.0; double maxlikelihoodC = 0.0; double maxProbC = 0.0; - double maxC2 = 0; int i_maxC_2 =0; int j_maxC_2 = 0; - - - out.print("Finding allele pair with highest likelihood\n"); - //Find the maximum likelihood scores for each HLA gene, - for (int i = 0; i < numHLAlleles; i++){ - for (int j = i; j < numHLAlleles; j++){ - //Print likelihoods for all alleles - if (DEBUG){}//out.printf("%s\t%s\t%5.0f\n",HLAnames.get(i),HLAnames.get(j),LOD[i][j]);} - if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){ - if (LOD[i][j] > maxA){ - maxA2 = maxA; i_maxA_2 = i_maxA; j_maxA_2 = j_maxA; - maxA = LOD[i][j]; i_maxA = i; j_maxA = j; maxlikelihoodA = LikelihoodScores[i][j]; - } - } else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){ - if (LOD[i][j] > maxB){ - maxB2 = maxB; i_maxB_2 = i_maxB; j_maxB_2 = j_maxB; - maxB = LOD[i][j]; i_maxB = i; j_maxB = j; maxlikelihoodB = LikelihoodScores[i][j]; - } - } else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){ - if (LOD[i][j] > maxC){ - maxC2 = maxC; i_maxC_2 = i_maxC; j_maxC_2 = j_maxC; - maxC = LOD[i][j]; i_maxC = i; j_maxC = j; maxlikelihoodC = LikelihoodScores[i][j]; - } - } - } - } - - - //Record alleles with the highest likelihood combinations, sum likelihoods (within 5 orders of magnitide of best score) to calculate posterior probabilities for each allele combination - for (Integer i = 0; i 
< numHLAlleles; i++){ - for (Integer j = i; j < numHLAlleles; j++){ - if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1 && maxA > 0){ - if (maxA - LOD[i][j] <= 10 && maxA >= LOD[i][j]){ - inverseMaxProbA = inverseMaxProbA + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA); - if (!TopAlleles.contains(i)){TopAlleles.add(i);} - if (!TopAlleles.contains(j)){TopAlleles.add(j);} - if (DEBUG){ - out.printf("HLA-A: %s, %s \tlikelihood=%.2f\tmax=%.2f\tLOD=%.2f\tmaxLOD=%.2f\tdelta_likelihood=%.2f\tinvP=%.2f\n",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],maxlikelihoodA,LOD[i][j],maxA,LikelihoodScores[i][j]-maxlikelihoodA,inverseMaxProbA); - } - } - } else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1 && maxB > 0){ - if (maxB - LOD[i][j] <= 10 && maxB - LOD[i][j] >= 0){ - inverseMaxProbB = inverseMaxProbB + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB); - if (!TopAlleles.contains(i)){TopAlleles.add(i);} - if (!TopAlleles.contains(j)){TopAlleles.add(j);} - if (DEBUG){ - out.printf("HLA-B: %s, %s \tlikelihood=%.2f\tmax=%.2f\tLOD=%.2f\tmaxLOD=%.2f\tdelta_likelihood=%.2f\tinvP=%.2f\n",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],maxlikelihoodB,LOD[i][j],maxB,LikelihoodScores[i][j]-maxlikelihoodB,inverseMaxProbB); - } - } - } else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1 && maxC > 0){ - if (maxC - LOD[i][j] <= 10 && maxC - LOD[i][j] >= 0){ - inverseMaxProbC = inverseMaxProbC + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC); - if (!TopAlleles.contains(i)){TopAlleles.add(i);} - if (!TopAlleles.contains(j)){TopAlleles.add(j);} - if (DEBUG){ - out.printf("HLA-C: %s, %s \tlikelihood=%.2f\tmax=%.2f\tLOD=%.2f\tmaxLOD=%.2f\tdelta_likelihood=%.2f\tinvP=%.2f\n",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],maxlikelihoodC,LOD[i][j],maxC,LikelihoodScores[i][j]-maxlikelihoodC,inverseMaxProbC); - } - } - - } - } - } 
- - out.printf("\nCalculating SNP correlation matrix for %s SNPs\n",SNPcount); - SNPcorrelation = new int[SNPs.size()*5][SNPs.size()*5]; //keep track of counts of each pair of SNPs - SNPcorrelationProb = new double[SNPs.size()][3]; // keep track of probabilities for specific haplotype at 2 SNPs. - - - - //Create correlation matrix and update correlation scores for all reads - for (int i = 0; i < AllReads.size(); i++){ - if (i == 2045){ - UpdateCorrelation(AllReads.get(i), false); - }else{ - UpdateCorrelation(AllReads.get(i), false); - } - //out.printf("[%s,%s]\n", ((Integer) SNPs.get("31431982")) * 4,((Integer) SNPs.get("31432003")) * 4 ); - //out.printf("%s\t[%s,%s]\t31431982[A] 31432003[A]\t%s\n",i,((Integer) SNPs.get("31431982")) * 4,((Integer) SNPs.get("31432003")) * 4 , SNPcorrelation[((Integer) SNPs.get("31431982")) * 4][((Integer) SNPs.get("31432003")) * 4]); - } - - Hashtable indexer = new Hashtable(); - indexer.put('A', (Integer) 0); - indexer.put('C', (Integer) 1); - indexer.put('G', (Integer) 2); - indexer.put('T', (Integer) 3); - indexer.put('D', (Integer) 4); // D for deletion - char[] bases = {'A','C','G','T','D'}; - - if ( false ){ - //prints entries in the correlation matrix that are > 0 - out.print("\n"); - for (int i = 0; i < SNPs.size(); i++){ - int loc1 = SNPlocations.get(i); - for (int j = i ; j < SNPs.size(); j++){ - int loc2 = SNPlocations.get(j); - for (char c1 : bases){ - for (char c2 : bases){ - int a = i*5 + (Integer) indexer.get(c1); - int b = j*5 + (Integer) indexer.get(c2); - if (SNPcorrelation[a][b] > 0){ - out.printf("[i,j]=[%s,%s]\t[a,b]=[%s,%s]\tPOS:%s[%s],%s[%s]\tCorr=%s\n",i,j,a,b,loc1,c1,loc2,c2,SNPcorrelation[a][b]); - } - } - } - } - } - } - - - int k, readstart, readstop, allelestart, allelestop, pos_k; - out.printf("Calculating phase scores for %s top alleles\n",TopAlleles.size()); - - //Calculate Phase score for each allele - double[] SinglePhaseScores = new double[TopAlleles.size()]; - for (int i = 0; i < 
TopAlleles.size(); i++){ - //SinglePhaseScores[i] = GetPhaseScore(TopAlleles.get(i)); - SinglePhaseScores[i] = GetPhaseProbability(TopAlleles.get(i)); - - if ( DEBUG ){ - //Debugging: print list of alleles to be checked for phasing - out.printf("index=%s\t%s\tscore=%.3f\n",TopAlleles.get(i),HLAnames.get(TopAlleles.get(i)),SinglePhaseScores[i]); - } - } - - out.print("Calculating phasing score for pairs of alleles\n"); - //Calculate phasing score and population frequencies for pairs of alleles, and find pairs with the highest scores, and sum combined probabilities - String alleleA, alleleB; - Double freq1 = 0.0, freq2 = 0.0; - Double ProbSumA = 0.0, ProbSumB = 0.0, ProbSumC = 0.0, likelihoodPrior; - Double PhaseSumA = 0.0, PhaseSumB = 0.0, PhaseSumC = 0.0; - for (Integer i = 0; i < numHLAlleles; i++){ - for (Integer j = i; j < numHLAlleles; j++){ - if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){ - if ((LOD[i][j] >= maxA - 10) && LOD[i][j] > 0){ - PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)]; - if (PhasingScores[i][j] > maxAphase){maxAphase = PhasingScores[i][j];} - alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;} - alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;} - SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2; - if (CombinedAlleleFrequencies[i][j] > maxAfreq){maxAfreq = CombinedAlleleFrequencies[i][j];} - likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA)/inverseMaxProbA; - ProbSumA = ProbSumA + likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]; - PhaseSumA = PhaseSumA + PhasingScores[i][j]; - if 
(likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] > maxProbA){maxProbA = likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j];} - } - } else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){ - if ((LOD[i][j] >= maxB - 10) && LOD[i][j] > 0){ - PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)]; - if (PhasingScores[i][j] > maxBphase){maxBphase = PhasingScores[i][j];} - alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;} - alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;} - SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2; - if (freq1*freq2 > maxBfreq){maxBfreq = freq1*freq2;} - likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB)/inverseMaxProbB; - ProbSumB = ProbSumB + likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]; - PhaseSumB = PhaseSumB + PhasingScores[i][j]; - if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] > maxProbB){maxProbB = likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j];} - } - } else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){ - if ((LOD[i][j] >= maxC - 10)&& LOD[i][j] > 0){ - PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)]; - if (PhasingScores[i][j] > maxCphase){maxCphase = PhasingScores[i][j];} - alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;} - alleleB=HLAnames.get(j).substring(4); if 
(AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;} - SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2; - if (freq1*freq2 > maxCfreq){maxCfreq = freq1*freq2;} - likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC)/inverseMaxProbC; - ProbSumC = ProbSumC + likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]; - PhaseSumC = PhaseSumC + PhasingScores[i][j]; - if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] > maxProbC){maxProbC = likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j];} - if (DEBUG){ - out.printf("DEBUG: %s\t%s\tPhaseScore[%.3f]\t%.3f\n",alleleA,alleleB,PhasingScores[i][j],PhaseSumC); - } - } - } - } - } - - if (DEBUG){ - out.printf("DEBUG: Phase sum for HLA-C: %.3f\n",PhaseSumC); - } - out.print("Printing results ...\n"); - //Print allele pairs with highest likelihood score and highest phasing score - for (Integer i = 0; i < numHLAlleles; i++){ - for (Integer j = i; j < numHLAlleles; j++){ - if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){ - if((LOD[i][j] >= maxA - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxAphase - likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA)/inverseMaxProbA; - //out.printf("%s\t%s\tloglikelihood=%5.0f\tmax=%5.0f\tinvP=%.2f\tPrior=%.3f\tPhase=%.3f\tfreq1=%s\tfreq2=%s\tf1*f2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],maxlikelihoodA,inverseMaxProbA,likelihoodPrior,PhasingScores[i][j],SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]/ProbSumA); - 
out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumA,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumA); - if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] == maxProbA){out.printf("\tBEST");} - out.printf("\n"); - } - } else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){ - if((LOD[i][j] >= maxB - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxBphase - likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB)/inverseMaxProbB; - out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumB,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumB); - if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] == maxProbB){out.printf("\tBEST");} - out.printf("\n"); - } - } else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){ - if((LOD[i][j] >= maxC - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxCphase - likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC)/inverseMaxProbC; - out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumC,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumC); - if 
(likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] == maxProbC){out.printf("\tBEST");} - out.printf("\n"); - } - } - } - } - - //2nd Highest likelihoods - for (int i = 0; i < numHLAlleles; i++){ - for (int j = i; j < numHLAlleles; j++){ - if (LOD[i][j] == maxA2){ - //out.printf("2nd Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",maxA2,HLAnames.get(i),HLAnames.get(j),i,j); - } - } - } - - PrintedOutput = true; - //out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n",((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond()); - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CigarParser.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CigarParser.java deleted file mode 100644 index 5dc0e22fa..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CigarParser.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * To change this template, choose Tools | Templates - * and open the template in the editor. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -/** - * Returns formatted read given read string and cigar string - * Essentially removes header bases, soft clipped bases, and currently removes insertions - * Deletions coded as "D" - * - * @author shermanjia - */ -public class CigarParser { - String formattedRead; - - public String GetFormattedRead(){ - return formattedRead; - } - - - public String FormatRead(String cigar, String read){ - // returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur - formattedRead = ""; - - char c; String count; - int cigarPlaceholder = 0; int subcigarLength = 0; - int readPlaceholder = 0; int subreadLength = 0; - - //reads cigar string - for (int i = 0; i < cigar.length(); i++){ - c = cigar.charAt(i); - if (c == 'M'){ - //If reach M for match/mismatch, get number immediately preceeding 'M' and tack on that many characters to sequence - subcigarLength = i-cigarPlaceholder; - count = cigar.substring(cigarPlaceholder, i); - - subreadLength = Integer.parseInt(count); - formattedRead = formattedRead + read.substring(readPlaceholder, readPlaceholder+subreadLength); - - //increment placeholders - cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + subreadLength; - } else if (c == 'I'){ - //***NOTE: To be modified later if needed (insertions removed here)*** - - //If reaches I for insertion, get number before 'I' and skip that many characters in sequence - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //increment placeholders without adding inserted bases to sequence (effectively removes insertion). 
- cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + subreadLength; - } else if (c == 'H'){ - //(H = hard clip) - - //If reaches H for hard clip, simply carry on - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion). - cigarPlaceholder = i+1; - } else if (c == 'S'){ - //(S = Soft clipped bases discarded here)*** - - //If reaches S for soft clip, get number before 'S' and skip that many characters in sequence - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion). - cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + subreadLength; - } else if (c == 'D'){ - //If reaches D for deletion, insert 'D' into sequence as placeholder - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //Add one 'D' for each deleted base - String deletion = ""; - for (int j = 1; j <= subreadLength; j++){ - deletion = deletion + "D"; - } - - //update placeholders - formattedRead = formattedRead + deletion; - cigarPlaceholder = i+1; - } - - } - return formattedRead; - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ClusterReadsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ClusterReadsWalker.java deleted file mode 100644 index d16d4d1d6..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ClusterReadsWalker.java +++ /dev/null @@ -1,332 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import 
org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.ArrayList; -import java.util.Hashtable; -import java.io.PrintStream; - -/** - * Compares reads to longest read at each exon. Usage: java -jar GenomeAnalysisTK.jar -T ClusterReads -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta [-filter INPUT.filter] | grep -v INFO | sort -k1 > OUTPUT - * @author shermanjia - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -public class ClusterReadsWalker extends ReadWalker { - @Output - public PrintStream out; - - @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false) - public String filterFile = ""; - - @Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false) - public int MAXALLOWEDMISMATCHES = 7; - - @Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false) - public int MINREQUIREDMATCHES = 0; - - String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; - - String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites"; - - boolean DatabaseLoaded = false; - boolean DEBUG = false; - - Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites; - double[] SingleAlleleFrequencies; - ArrayList ReadsToDiscard = new ArrayList(); - ArrayList AlignedReads = new ArrayList(); - - Hashtable MaxNumMatches, MaxConcordance, NumCompared = new Hashtable(); - - double[] nummatched, concordance, numcompared; - - SAMRecord A2, A3, B2, B3, C2, C3; - int MaxMatchesA2 = 0, MaxMatchesA3 = 0, MaxMatchesB2 = 0, MaxMatchesB3 = 0, MaxMatchesC2 = 0, MaxMatchesC3 = 0; - int A2start = 30018513, A2stop = 30018781; - int A3start = 30019024, A3stop = 30019300; - - int C2start = 31347355, C2stop = 31347623; - 
int C3start = 31346829, C3stop = 31347104; - - int B2start = 31432444, B2stop = 31432714; - int B3start = 31431923, B3stop = 31432198; - - int minstartpos = 0; - int maxstoppos = 0; - - - Hashtable AlleleFrequencies = new Hashtable(); - int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; - CigarParser formatter = new CigarParser(); - - public Integer reduceInit() { - if (!DatabaseLoaded){ - DatabaseLoaded = true; - - PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader(); - siteFileReader.ReadFile(PolymorphicSitesFile); - PolymorphicSites = siteFileReader.GetPolymorphicSites(); - NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites(); - - - out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length); - - if (!filterFile.equals("")){ - out.printf("INFO Reading properties file ... "); - SimilarityFileReader similarityReader = new SimilarityFileReader(); - similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES); - ReadsToDiscard = similarityReader.GetReadsToDiscard(); - MaxNumMatches = similarityReader.GetNumMatches(); - MaxConcordance = similarityReader.GetConcordance(); - - out.printf("Done! 
Found %s misaligned reads to discard.\n",ReadsToDiscard.size()); - for (int i = 0; i < ReadsToDiscard.size(); i++){ - out.printf("MISALIGNED %s\n", ReadsToDiscard.get(i).toString()); - } - } - - out.printf("INFO Comparing reads ...\n"); - - if (DEBUG){ - //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles); - } - } - return 0; - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - //Calculate concordance for this read and all overlapping reads - if (!ReadsToDiscard.contains(read.getReadName())){ - AlignedReads.add(read); - - int readstart = read.getAlignmentStart(); - int readstop = read.getAlignmentEnd(); - int length = readstop - readstart + 1; - if (MaxNumMatches.containsKey(read.getReadName())){ - int maxMatches = Integer.parseInt(MaxNumMatches.get(read.getReadName()).toString()); - double concordance = Double.parseDouble((String) MaxConcordance.get(read.getReadName()).toString()); - - if (readstart < A2stop && readstop > A2start){ - if (maxMatches > MaxMatchesA2 && concordance > 0.95){ - MaxMatchesA2 = maxMatches; - A2 = read; - } - } else if (readstart < A3stop && readstop > A3start){ - if (maxMatches > MaxMatchesA3){ - MaxMatchesA3 = maxMatches; - A3 = read; - } - } else if (readstart < B2stop && readstop > B2start){ - if (maxMatches > MaxMatchesB2){ - MaxMatchesB2 = maxMatches; - B2 = read; - } - } else if (readstart < B3stop && readstop > B3start){ - if (maxMatches > MaxMatchesB3){ - MaxMatchesB3 = maxMatches; - B3 = read; - } - } else if (readstart < C2stop && readstop > C2start){ - if (maxMatches > MaxMatchesC2){ - MaxMatchesC2 = maxMatches; - C2 = read; - } - } else if (readstart < C3stop && readstop > C3start){ - if (maxMatches > MaxMatchesC3){ - MaxMatchesC3 = maxMatches; - C3 = read; - } - } - }else{ - out.printf("Data for %s not found\n",read.getReadName()); - } - } - return 1; - } - - public Integer 
reduce(Integer value, Integer sum) { - - return value + sum; - } - - public void onTraversalDone(Integer numreads) { - SAMRecord read; String name, name2; String locus = ""; - int A2a = 0, A2b = 0, A2c = 0; - int A3a = 0, A3b = 0, A3c = 0; - int B2a = 0, B2b = 0, B2c = 0; - int B3a = 0, B3b = 0, B3c = 0; - int C2a = 0, C2b = 0, C2c = 0; - int C3a = 0, C3b = 0, C3c = 0; - double minA2 = -1, minA3 = -1, minB2 = -1, minB3 = -1, minC2 = -1, minC3 = -1; - double maxA2 = 0, maxA3 = 0, maxB2 = 0, maxB3 = 0, maxC2 = 0, maxC3 = 0; - double ratioA2 = 0, ratioA3 = 0, ratioB2 = 0, ratioB3 = 0, ratioC2 = 0, ratioC3 = 0; - double maxA2l = 0, maxA3l = 0, maxB2l = 0, maxB3l = 0, maxC2l = 0, maxC3l = 0; - double maxA2d = 0, maxA3d = 0, maxB2d = 0, maxB3d = 0, maxC2d = 0, maxC3d = 0; - double a2, a3, b2, b3, c2, c3, normalized = 0; - int readstart, readstop; - double matches, compared, concordance; - STATS stats; - - for (int i = 0; i < AlignedReads.size(); i++){ - read = AlignedReads.get(i); - readstart = read.getAlignmentStart(); - readstop = read.getAlignmentEnd(); - if (readstart < A2stop && readstop > A2start){ - stats = CalculateConcordance(read, A2, "A2", A2start, A2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); - if (stats.getNumCompared() > 40){if (minA2 < 0 || minA2 > concordance){minA2 = concordance;}; if (concordance > maxA2){maxA2 = concordance;}; if (matches > maxA2l){maxA2l = matches;}; if (compared-matches > maxA2d){maxA2d = compared-matches;}}else{A2c++;} - } else if (readstart < A3stop && readstop > A3start){ - stats = CalculateConcordance(read, A3, "A3", A3start, A3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); - if (stats.getNumCompared() > 40){if (minA3 < 0 || minA3 > concordance){minA3 = concordance;}; if (concordance > maxA3){maxA3 = concordance;}; if (matches > maxA3l){maxA3l = matches;}; if (compared-matches > maxA3d){maxA3d = 
compared-matches;}}else{A3c++;} - } else if (readstart < B2stop && readstop > B2start){ - stats = CalculateConcordance(read, B2, "B2", B2start, B2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); - if (stats.getNumCompared() > 40){if (minB2 < 0 || minB2 > concordance){minB2 = concordance;}; if (concordance > maxB2){maxB2 = concordance;}; if (matches > maxB2l){maxB2l = matches;}; if (compared-matches > maxB2d){maxB2d = compared-matches;}}else{B2c++;} - } else if (readstart < B3stop && readstop > B3start){ - stats = CalculateConcordance(read, B3, "B3", B3start, B3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); - if (stats.getNumCompared() > 40){if (minB3 < 0 || minB3 > concordance){minB3 = concordance;}; if (concordance > maxB3){maxB3 = concordance;}; if (matches > maxB3l){maxB3l = matches;}; if (compared-matches > maxB3d){maxB3d = compared-matches;}}else{B3c++;} - } else if (readstart < C2stop && readstop > C2start){ - stats = CalculateConcordance(read, C2, "C2", C2start, C2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); - if (stats.getNumCompared() > 40){if (minC2 < 0 || minC2 > concordance){minC2 = concordance;}; if (concordance > maxC2){maxC2 = concordance;}; if (matches > maxC2l){maxC2l = matches;}; if (compared-matches > maxC2d){maxC2d = compared-matches;}}else{C2c++;} - } else if (readstart < C3stop && readstop > C3start){ - stats = CalculateConcordance(read, C3, "C3", C3start, C3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); - if (stats.getNumCompared() > 40){if (minC3 < 0 || minC3 > concordance){minC3 = concordance;}; if (concordance > maxC3){maxC3 = concordance;}; if (matches > maxC3l){maxC3l = matches;}; if (compared-matches > maxC3d){maxC3d = compared-matches;}}else{C3c++;} - } - } - - - for (int i = 
0; i < AlignedReads.size(); i++){ - read = AlignedReads.get(i); - readstart = read.getAlignmentStart(); - readstop = read.getAlignmentEnd(); - name = read.getReadName(); name2 = ""; - if (NumCompared.containsKey(name)){ - compared = Double.parseDouble((String) NumCompared.get(name).toString()); - matches = Double.parseDouble((String) MaxNumMatches.get(name).toString()); - concordance = Double.parseDouble((String) MaxConcordance.get(name).toString()); - if (matches > 40){ - if (readstart < A2stop && readstop > A2start){ - locus = "A2"; name2 = A2.getReadName(); - a2 = (concordance - minA2)/(maxA2-minA2); if (a2 >= .5){A2a++;}else{A2b++;}; normalized = a2; - } else if (readstart < A3stop && readstop > A3start){ - locus = "A3"; name2 = A3.getReadName(); - a3 = (concordance - minA3)/(maxA3-minA3); if (a3 >= .5){A3a++;}else{A3b++;}; normalized = a3; - } else if (readstart < B2stop && readstop > B2start){ - locus = "B2"; name2 = B2.getReadName(); - b2 = (concordance - minB2)/(maxB2-minB2); if (b2 >= .5){B2a++;}else{B2b++;}; normalized = b2; - } else if (readstart < B3stop && readstop > B3start){ - locus = "B3"; name2 = B3.getReadName(); - b3 = (concordance - minB3)/(maxB3-minB3); if (b3 >= .5){B3a++;}else{B3b++;}; normalized = b3; - } else if (readstart < C2stop && readstop > C2start){ - locus = "C2"; name2 = C2.getReadName(); - c2 = (concordance - minC2)/(maxC2-minC2); if (c2 >= .5){C2a++;}else{C2b++;}; normalized = c2; - } else if (readstart < C3stop && readstop > C3start){ - locus = "C3"; name2 = C3.getReadName(); - c3 = (concordance - minC3)/(maxC3-minC3); if (c3 >= .5){C3a++;}else{C3b++;}; normalized = c3; - } - out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\t%.3f\n",locus,name,name2,matches,compared,concordance,normalized); - }else{ - out.printf("%s (compared at %s sites) is too short\n",name,matches); - } - }else{ - out.printf("%s [%s to %s] not found\n",name,readstart,readstop); - } - } - - if (A2a > 0 && A2b > 0){if (A2a > A2b){ratioA2 = 
(double)A2b/(A2a+A2b);}else{ratioA2 = (double)A2a/(A2a+A2b);}}else{ratioA2 = -1;} - if (A3a > 0 && A3b > 0){if (A3a > A3b){ratioA3 = (double)A3b/(A3a+A3b);}else{ratioA3 = (double)A3a/(A3a+A3b);}}else{ratioA3 = -1;} - if (B2a > 0 && B2b > 0){if (B2a > B2b){ratioB2 = (double)B2b/(B2a+B2b);}else{ratioB2 = (double)B2a/(B2a+B2b);}}else{ratioB2 = -1;} - if (B3a > 0 && B3b > 0){if (B3a > B3b){ratioB3 = (double)B3b/(B3a+B3b);}else{ratioB3 = (double)B3a/(B3a+B3b);}}else{ratioB3 = -1;} - if (C2a > 0 && C2b > 0){if (C2a > C2b){ratioC2 = (double)C2b/(C2a+C2b);}else{ratioC2 = (double)C2a/(C2a+C2b);}}else{ratioC2 = -1;} - if (C3a > 0 && C3b > 0){if (C3a > C3b){ratioC3 = (double)C3b/(C3a+C3b);}else{ratioC3 = (double)C3a/(C3a+C3b);}}else{ratioC3 = -1;} - - out.printf("RATIO_A2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA2,A2a,A2b,A2c,maxA2-minA2,maxA2l,maxA2d); - out.printf("RATIO_A3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA3,A3a,A3b,A3c,maxA3-minA3,maxA3l,maxA3d); - out.printf("RATIO_B2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB2,B2a,B2b,B2c,maxB2-minB2,maxB2l,maxB2d); - out.printf("RATIO_B3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB3,B3a,B3b,B3c,maxB3-minB3,maxB3l,maxB3d); - out.printf("RATIO_C2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC2,C2a,C2b,C2c,maxC2-minC2,maxC2l,maxC2d); - out.printf("RATIO_C3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC3,C3a,C3b,C3c,maxC3-minC3,maxC3l,maxC3d); - - } - - public class STATS { - protected double concordance = 0.0; - protected double numcompared = 0; - protected double nummatches = 0; - - public STATS(double d, double i, double m) { - concordance = d; - numcompared = i; - nummatches = m; - } - - public double getConcordance() { - return concordance; - } - - public double getNumCompared() { - return numcompared; - } - - public double getNumMatches() { - return nummatches; - } - } - - private STATS CalculateConcordance(SAMRecord read1, SAMRecord read2, String locus, int start, int stop){ - int start1 = read1.getAlignmentStart(), 
stop1 = read1.getAlignmentEnd(); - int start2 = read2.getAlignmentStart(), stop2 = read2.getAlignmentEnd(); - - int pos; - double numcompared = 0, nummatched = 0, concordance; - char c1, c2; - String s1 = formatter.FormatRead(read1.getCigarString(), read1.getReadString()); - String s2 = formatter.FormatRead(read2.getCigarString(), read2.getReadString()); - - - //Polymorphic sites: always increment denominator, increment numerator when bases are concordant - for (int j = 0; j < PolymorphicSites.length; j++){ - pos = PolymorphicSites[j]; - if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2 && pos >= start && pos <= stop){ - c1 = s1.charAt(pos-start1); - c2 = s2.charAt(pos-start2); - if (c1 != 'D'){//allow for deletions (sequencing errors) - numcompared++; - if (c1 == c2){ - nummatched++; - } - } - } - } - - //Non-polymorphic sites: increment denominator only when bases are discordant - if (false){ - for (int j = 0; j < NonPolymorphicSites.length; j++){ - pos = NonPolymorphicSites[j]; - if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2){ - c1 = s1.charAt(pos-start1); - c2 = s2.charAt(pos-start2); - if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors) - numcompared++; - } - } - } - } - - //Update concordance array - concordance=nummatched/numcompared; - - MaxNumMatches.put(read1.getReadName(), nummatched); - NumCompared.put(read1.getReadName(), numcompared); - MaxConcordance.put(read1.getReadName(), concordance); - //out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\n",locus,read1.getReadName(),read2.getReadName(),nummatched,numcompared,concordance); - - return new STATS(concordance, numcompared, nummatched); - } - -} - - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreateHaplotypesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreateHaplotypesWalker.java deleted file mode 100644 index 0b63d3da9..000000000 --- 
a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreateHaplotypesWalker.java +++ /dev/null @@ -1,88 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Output; - -import java.util.Hashtable; -import java.io.PrintStream; - -/** - * Creates a haplotype file given reads (for SNP analysis, imputation, etc) - * @author shermanjia - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -public class CreateHaplotypesWalker extends ReadWalker { - @Output - PrintStream out; - - CigarParser formatter = new CigarParser(); - char c; - boolean DEBUG = false; - int HLA_A_start = 30018310; - int HLA_A_end = 30021211; - String[] SNPnames; - String SNPname; - int start, end; - - Hashtable indexer = new Hashtable(); - - public Integer reduceInit() { - SNPnames = new String[HLA_A_end-HLA_A_start+1]; - start = HLA_A_start; - end = HLA_A_end; - - indexer.put('A', (Integer) 1); - indexer.put('C', (Integer) 2); - indexer.put('G', (Integer) 3); - indexer.put('T', (Integer) 4); - indexer.put('D', (Integer) 5); // D for deletion - out.print("Reads:\n"); - return 0; - } - - - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - - int readstart = read.getAlignmentStart(); - int readstop = read.getAlignmentEnd(); - - if(readstart <= HLA_A_end && readstop >= HLA_A_start){ - String s = formatter.FormatRead(read.getCigarString(),read.getReadString()); - String name = read.getReadName(); - - out.printf("%s->%s HAPLO01 ",name,name); - - for (int i = start; i <= end; i++){ - - if (i - readstart < s.length()){ - c = s.charAt(i-readstart); - out.printf("%s",indexer.get(c)); - }else{ - out.print("5"); - } - } - out.printf("\n"); - } - return 1; - } - 
- - - public Integer reduce(Integer value, Integer sum) { - - return value + sum; - } - - public void onTraversalDone(Integer value) { - out.print("\nSNP names:\n"); - for (int pos = start; pos <= end; pos++){ - SNPname = "CHR6_POS" + String.valueOf(pos); - SNPnames[pos-start]=SNPname; - out.printf("%s\n",SNPname); - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java deleted file mode 100644 index e1f28ece3..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java +++ /dev/null @@ -1,520 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.ArrayList; -import java.util.Hashtable; -import java.util.Enumeration; -import java.io.PrintStream; - -/** - * Creates a ped file of SNPs and amino acids coded as SNPs given an input ped file with 4-digit HLA alleles. Usage: java -jar GenomeAnalysisTK.jar -T CreatePedFile --allelesFile INPUT.ped -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-sc\ -r1/GSA/sjia/454_HLA/HLA/HLA.combined.4digitUnique.bam > OUTPUT.log - * @author shermanjia - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -public class CreatePedFileWalker extends ReadWalker { - @Output - public PrintStream out; - - @Argument(fullName = "allelesFile", shortName = "allelesFile", doc = "Create ped file for HLA alleles named in this file", required = true) - public String alleleNamesFile = ""; - - @Argument(fullName = "pedIntervals", shortName = "pedIntervals", doc = "Create genotypes in these intervals", required = false) - public String pedIntervalsFile = ""; - - @Argument(fullName = "HLAexonIntervals", shortName = "HLAexonIntervals", doc = "HLA exonic intervals", required = false) - public String exonIntervalsFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_EXON_POSITIONS.txt"; - - @Argument(fullName = "DNAcode", shortName = "DNAcode", doc = "Amino acid codes", required = false) - public String dnaCodesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/DNA_CODE.txt"; - - @Argument(fullName = "PrintDNA", shortName = "PrintDNA", doc = "Print DNA sequences", required = false) - public boolean PrintDNA = false; - - @Argument(fullName = "PrintAA", shortName = "PrintAA", doc = 
"Print Amino Acid sequences", required = false) - public boolean PrintAA = true; - - String[] HLAnames, HLAreads, inputFileContents; - Integer[] HLAstartpos, HLAstoppos; - ArrayList HLAnamesAL, HLAreadsAL; - ArrayList HLAstartposAL, HLAstopposAL; - int[][] intervals; String[][] exonIntervals; - int numIntervals; - - ReadCigarFormatter formatter = new ReadCigarFormatter(); - char c; - boolean DEBUG = false; - boolean FilesLoaded = false; - int HLA_A_start = 30018310, HLA_A_end = 30021211; - int HLA_C_start = 31344925, HLA_C_end = 31347827; - int HLA_B_start = 31430239, HLA_B_end = 31432914; - int HLA_DRB1_start = 32654846, HLA_DRB1_end = 32665497; - int HLA_DQA1_start = 32713214, HLA_DQA1_end = 32718519; - int HLA_DQB1_start = 32735991, HLA_DQB1_end = 32742362; - int HLA_DPA1_start = 33144405, HLA_DPA1_end = 33149325; - int HLA_DPB1_start = 33151797, HLA_DPB1_end = 33161993; - - - String[] SNPnames; - String SNPname; - int start, end; - Integer I; - - Hashtable indexer = new Hashtable(); - Hashtable DNAcode = new Hashtable(); - - public Integer reduceInit() { - if (!FilesLoaded){ - FilesLoaded = true; - HLAnamesAL = new ArrayList(); - HLAreadsAL = new ArrayList(); - HLAstartposAL = new ArrayList(); - HLAstopposAL = new ArrayList(); - - TextFileReader fileReader = new TextFileReader(); - fileReader.ReadFile(alleleNamesFile); - inputFileContents = fileReader.GetLines(); - - - //Determine intervals - if (!pedIntervalsFile.equals("")){ - fileReader = new TextFileReader(); - fileReader.ReadFile(pedIntervalsFile); - String[] lines = fileReader.GetLines(); - intervals = new int[lines.length][2]; - for (int i = 0; i < lines.length; i++) { - String[] s = lines[i].split(":"); - String[] intervalPieces = s[0].split("-"); - intervals[i][0] = Integer.valueOf(intervalPieces[0]); - intervals[i][1] = Integer.valueOf(intervalPieces[1]); - } - numIntervals = intervals.length; - for (int i = 0; i < numIntervals; i++){ - out.printf("INFO Interval %s: 
%s-%s\n",i+1,intervals[i][0],intervals[i][1]); - } - } - - //load HLA exonic intervals - if (!exonIntervalsFile.equals("")){ - fileReader = new TextFileReader(); - fileReader.ReadFile(exonIntervalsFile); - String[] lines = fileReader.GetLines(); - exonIntervals = new String[lines.length][5]; - for (int i = 0; i < lines.length; i++) { - String[] s = lines[i].split("\t"); - String[] intervalPieces = s[1].split("-"); - exonIntervals[i][1] = intervalPieces[0]; - exonIntervals[i][2] = intervalPieces[1]; - exonIntervals[i][0] = s[0]; // Locus - exonIntervals[i][3] = s[2]; // Exon number - exonIntervals[i][4] = s[3]; // +/- strand - } - numIntervals = exonIntervals.length; - for (int i = 0; i < numIntervals; i++){ - out.printf("INFO HLA-%s %s (%s): %s-%s\n",exonIntervals[i][0],exonIntervals[i][3],exonIntervals[i][4],exonIntervals[i][1],exonIntervals[i][2]); - } - } - - //load amino-acid coding DNA triplets - if (!dnaCodesFile.equals("")){ - fileReader = new TextFileReader(); - fileReader.ReadFile(dnaCodesFile); - String[] lines = fileReader.GetLines(); - for (int i = 0; i < lines.length; i++) { - String[] s = lines[i].split("\t"); - DNAcode.put(s[0],s[1]); - } - - Enumeration e = DNAcode.keys(); - while( e.hasMoreElements() ){ - String key = e.nextElement().toString(); - out.printf("INFO %s encodes %s\n",key,DNAcode.get(key)); - } - } - } - return 0; - } - - private String[][] GetExonIntervals(String locus, boolean isForwardStrand){ - int numExons = 0; int exonNum; - for (int i = 0; i < exonIntervals.length; i++){ - if (exonIntervals[i][0].equals(locus)){ - numExons++; - } - } - String[][] ExonIntervals = new String[numExons][5]; - if (isForwardStrand){exonNum = 1;}else{exonNum = ExonIntervals.length;} - for (int i = 0; i < exonIntervals.length; i++){ - if (exonIntervals[i][0].equals(locus)){ - ExonIntervals[exonNum-1]=exonIntervals[i]; - if (isForwardStrand){ - exonNum++; - }else{ - exonNum--; - } - } - } - return ExonIntervals; - } - - private int BaseCharToInt(char c){ 
- switch(c){ - case 'A': return 1; - case 'C': return 2; - case 'G': return 3; - case 'T': return 4; - default: return -1; - } - } - - private char Complement(char c){ - switch(c){ - case 'A': return 'T'; - case 'C': return 'G'; - case 'G': return 'C'; - case 'T': return 'A'; - default: return '0'; - } - } - - private char GetAminoAcid(String codon){ - if (DNAcode.containsKey(codon)){ - return DNAcode.get(codon).toString().charAt(0); - }else{ - return '0'; - } - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - HLAnamesAL.add(read.getReadName()); - HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString())); - HLAstartposAL.add(read.getAlignmentStart()); - HLAstopposAL.add(read.getAlignmentEnd()); - return 1; - } - - private String PrintGenotypes(String ID, String alleleName1, String alleleName2, int startpos, int stoppos){ - - String error = ""; - //prints genotypes for allele1 and allele2 at given interval - int i1 = GetAlleleIndex(alleleName1); - int i2 = GetAlleleIndex(alleleName2); - String s1, s2; - int start1, start2, stop1, stop2; - char c1, c2; - - if (i1 > -1){ - s1 = HLAreads[i1]; - start1 = HLAstartpos[i1]; - stop1 = HLAstoppos[i1]; - }else{ - error = error + "INFO " + alleleName1 + " for " + ID + " not found in HLA dictionary\n"; - s1 = ""; - start1 = -1; - stop1 = -1; - } - - if (i2 > -1){ - s2 = HLAreads[i2]; - start2 = HLAstartpos[i2]; - stop2 = HLAstoppos[i2]; - }else{ - error = error + "INFO " + alleleName2 + " for " + ID + " not found in HLA dictionary\n"; - s2 = ""; - start2 = -1; - stop2 = -1; - } - - for (int pos = startpos; pos <= stoppos; pos++){ - c1 = GetBase(pos,s1,start1,stop1); - c2 = GetBase(pos,s2,start2,stop2); - out.printf("\t%s %s",c1,c2); - } - return error; - } - -private String PrintAminoAcids(String ID, String alleleName1, String alleleName2, String[][] ExonIntervals){ - - String error = ""; - //prints genotypes for allele1 and allele2 at given interval 
- int i1 = GetAlleleIndex(alleleName1); - int i2 = GetAlleleIndex(alleleName2); - String s1, s2; - int start1, start2, stop1, stop2; - char c1, c2; - boolean isForwardStrand = false; - if (ExonIntervals[0][4].equals("+")){isForwardStrand=true;} - - int AAcount=0; - int baseCount=0; - String codon1 = ""; String codon2 = ""; - - if (i1 > -1){ - s1 = HLAreads[i1]; - start1 = HLAstartpos[i1]; - stop1 = HLAstoppos[i1]; - }else{ - s1 = ""; - start1 = -1; - stop1 = -1; - error = error + "INFO " + alleleName1 + " for " + ID + " not found in HLA dictionary\n"; - } - - if (i2 > -1){ - s2 = HLAreads[i2]; - start2 = HLAstartpos[i2]; - stop2 = HLAstoppos[i2]; - }else{ - s2 = ""; - start2 = -1; - stop2 = -1; - error = error + "INFO " + alleleName2 + " for " + ID + " not found in HLA dictionary\n"; - } - - int i; - for (int exonNum = 1; exonNum <= ExonIntervals.length; exonNum++){ - if (isForwardStrand){i=exonNum-1;}else{i=ExonIntervals.length-exonNum;} - int exonStart = Integer.parseInt(ExonIntervals[i][1]); - int exonStop = Integer.parseInt(ExonIntervals[i][2]); - for (int pos = exonStart; pos <= exonStop; pos++){ - c1 = GetBase(pos,s1,start1,stop1); - c2 = GetBase(pos,s2,start2,stop2); - if (!isForwardStrand){ - c1 = Complement(c1); - c2 = Complement(c2); - } - if (baseCount < 3){ - if (isForwardStrand){ - codon1 = codon1 + c1; - codon2 = codon2 + c2; - }else{ - codon1 = c1 + codon1; - codon2 = c2 + codon2; - } - baseCount++; - } - - if (baseCount == 3){ - out.printf("\t%s %s",GetAminoAcid(codon1),GetAminoAcid(codon2)); - baseCount = 0; - AAcount++; - codon1 = ""; - codon2 = ""; - } - } - } - if (baseCount > 0){ - //Print stop or start codon depending on strandedness - if (isForwardStrand){out.printf("\tO O");}else{out.printf("\tM M");} - } - - return error; - } - - private char GetBase(int pos, String str, int start, int stop){ - char base; - if (pos >= start && pos <= stop){ - base = str.charAt(pos-start); - if (base == 'D'){base = '0';} - }else{ - base = '0'; - } - return 
base; - } - - private int GetAlleleIndex(String alleleName){ - //Find first allele that matches name, or matches part of name for 2-digit allele - int i; - for (i = 0; i < HLAnames.length; i++){ - if (HLAnames[i].indexOf(alleleName) > -1){ - return i; - } - } - return -1; - - } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - private String GetAlleleName(String locus, String sep, String allele){ - if (allele.length() > 1){ - return locus + sep + allele; - }else{ - return locus + sep + "0000"; - } - } - - public void onTraversalDone(Integer numreads) { - HLAnames = HLAnamesAL.toArray(new String[numreads]); - HLAreads = HLAreadsAL.toArray(new String[numreads]); - HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]); - HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]); - String star = "*"; - String error = ""; - - //out.printf("INFO %s alleles in dictionary\n",HLAnames.length); - String[][] A_exons = GetExonIntervals("A",true); - String[][] B_exons = GetExonIntervals("B",false); - String[][] C_exons = GetExonIntervals("C",false); - String[][] DRB1_exons = GetExonIntervals("DRB1",false); - String[][] DQB1_exons = GetExonIntervals("DQB1",false); - String[][] DQA1_exons = GetExonIntervals("DQA1",true); - String[][] DPB1_exons = GetExonIntervals("DPB1",true); - String[][] DPA1_exons = GetExonIntervals("DPA1",false); - //Print individual info and genotypes - for (int i = 0; i < inputFileContents.length; i++){ - String[] s = inputFileContents[i].split(" "); - //out.printf("%s\t%s\n",inputFileContents[i],s.length); - if (s.length > 10){ - error = ""; - out.printf("%s\t%s\t%s\t%s\t%s\t%s",s[0],s[1],s[2],s[3],s[4],s[5]); - String HLA_A_1 = GetAlleleName("HLA_A",star,s[6]); - String HLA_A_2 = GetAlleleName("HLA_A",star,s[7]); - String HLA_B_1 = GetAlleleName("HLA_B",star,s[8]); - String HLA_B_2 = GetAlleleName("HLA_B",star,s[9]); - String HLA_C_1 = GetAlleleName("HLA_C",star,s[10]); - String HLA_C_2 = 
GetAlleleName("HLA_C",star,s[11]); - String HLA_DPA1_1 = GetAlleleName("HLA_DPA1",star,s[12]); - String HLA_DPA1_2 = GetAlleleName("HLA_DPA1",star,s[13]); - String HLA_DPB1_1 = GetAlleleName("HLA_DPB1",star,s[14]); - String HLA_DPB1_2 = GetAlleleName("HLA_DPB1",star,s[15]); - String HLA_DQA1_1 = GetAlleleName("HLA_DQA1",star,s[16]); - String HLA_DQA1_2 = GetAlleleName("HLA_DQA1",star,s[17]); - String HLA_DQB1_1 = GetAlleleName("HLA_DQB1",star,s[18]); - String HLA_DQB1_2 = GetAlleleName("HLA_DQB1",star,s[19]); - String HLA_DRB1_1 = GetAlleleName("HLA_DRB1",star,s[20]); - String HLA_DRB1_2 = GetAlleleName("HLA_DRB1",star,s[21]); - - - - if (true) { - if (PrintDNA){ - error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end); - error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end); - error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end); - error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end); - error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end); - error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end); - error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end); - error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end); - } - if (PrintAA){ - error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons); - error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons); - error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons); - error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons); - error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons); - error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons); - error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons); - error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons); - } - out.printf("\n"); - 
out.printf("%s",error); - } - } - } - - //Prints SNP names for each site - if (true){ - if (PrintDNA){ - PrintSNPS(HLA_A_start,HLA_A_end); - PrintSNPS(HLA_C_start,HLA_C_end); - PrintSNPS(HLA_B_start,HLA_B_end); - PrintSNPS(HLA_DRB1_start,HLA_DRB1_end); - PrintSNPS(HLA_DQA1_start,HLA_DQA1_end); - PrintSNPS(HLA_DQB1_start,HLA_DQB1_end); - PrintSNPS(HLA_DPA1_start,HLA_DPA1_end); - PrintSNPS(HLA_DPB1_start,HLA_DPB1_end); - } - - if (PrintAA){ - PrintAminoAcidSites(A_exons,"A",true); - PrintAminoAcidSites(C_exons,"C",false); - PrintAminoAcidSites(B_exons,"B",false); - PrintAminoAcidSites(DRB1_exons,"DRB1",false); - PrintAminoAcidSites(DQA1_exons,"DQA1",true); - PrintAminoAcidSites(DQB1_exons,"DQB1",false); - PrintAminoAcidSites(DPA1_exons,"DPA1",false); - PrintAminoAcidSites(DPB1_exons,"DPB1",true); - } - } - - } - - private void PrintSNPS(int startpos, int stoppos){ - for (int pos = startpos; pos <= stoppos; pos++){ - SNPname = "CHR6_POS" + String.valueOf(pos); - out.printf("6\t%s\t0\t%s\n",SNPname,pos); - } - } - - private void PrintAminoAcidSites(String[][] ExonIntervals, String locus, boolean isForwardStrand){ - int AAcount=1; int baseCount = 1; int exonNum; - - if (!isForwardStrand){ - for (int i = 1; i <= ExonIntervals.length; i++){ - int exonStart = Integer.parseInt(ExonIntervals[i-1][1]); - int exonStop = Integer.parseInt(ExonIntervals[i-1][2]); - for (int pos = exonStart; pos <= exonStop; pos++){ - if (baseCount == 3){ - AAcount++; - baseCount = 1; - }else{ - baseCount++; - } - } - } - } - - for (int i = 1; i <= ExonIntervals.length; i++){ - if (isForwardStrand){exonNum = i;}else{exonNum = ExonIntervals.length - i + 1;} - int exonStart = Integer.parseInt(ExonIntervals[exonNum-1][1]); - int exonStop = Integer.parseInt(ExonIntervals[exonNum-1][2]); - for (int pos = exonStart; pos <= exonStop; pos++){ - if (baseCount == 2){ - SNPname = locus + "_AA" + String.valueOf(AAcount) + "_E" + exonNum + "_" + String.valueOf(pos); - out.printf("6\t%s\t0\t%s\n",SNPname,pos); 
- } - if (baseCount == 3){ - if (isForwardStrand){AAcount++;}else{AAcount--;} - baseCount = 1; - }else{ - baseCount++; - } - } - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestHLAWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestHLAWalker.java deleted file mode 100644 index df4169a3e..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestHLAWalker.java +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.ArrayList; -import java.util.Hashtable; -import java.io.PrintStream; - -/** - * Finds the most similar HLA allele for each read (helps detect misalignments). Usage: java -jar GenomeAnalysisTK.jar -T FindClosestHLA -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval | grep -v INFO | sort -k1 > OUTPUT - * @author shermanjia - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -public class FindClosestHLAWalker extends ReadWalker { - @Output - protected PrintStream out; - - @Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false) - public String debugRead = ""; - - @Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false) - public boolean findFirst = false; - - @Argument(fullName = "DEBUG", shortName = "DEBUG", doc = "Debug walker", required = false) - public boolean DEBUG = false; - - @Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false) - public String debugAllele = ""; - - @Argument(fullName = "useInterval", shortName = "useInterval", doc = "Use only these intervals", required = false) - public String intervalFile = ""; - - @Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false) - public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam"; - - @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only 
consider alleles with frequency > 0.0001", required = false) - public boolean ONLYFREQUENT = false; - - @Argument(fullName = "HLAdictionary", shortName = "HLAdictionary", doc = "HLA dictionary file", required = true) - public String HLAdatabaseFile = "HLA_DICTIONARY.txt"; - - @Argument(fullName = "PolymorphicSites", shortName = "PolymorphicSites", doc = "file containing polymorphic sites within the HLA", required = true) - public String PolymorphicSitesFile = "HLA_POLYMORPHIC_SITES.txt"; - - HLAFileReader HLADictionaryReader = new HLAFileReader(); - - boolean DatabaseLoaded = false; - ArrayList ClosestAlleles = new ArrayList(); - - String[] HLAnames, HLAreads; - Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites; - double[] SingleAlleleFrequencies; - - double[] nummatched, concordance, numcompared; - int numHLAlleles = 0; - int minstartpos = 0; - int maxstoppos = 0; - int numpolymorphicsites = 0, numnonpolymorphicsites = 0, pos =0; - - Hashtable AlleleFrequencies = new Hashtable(); - int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; - CigarParser formatter = new CigarParser(); - int [][] intervals; int numIntervals; - - public Integer reduceInit() { - if (!DatabaseLoaded){ - DatabaseLoaded = true; - - //Load HLA dictionary - out.printf("INFO Loading HLA dictionary ... "); - - HLADictionaryReader.ReadFile(HLAdatabaseFile); - HLAreads = HLADictionaryReader.GetSequences(); - HLAnames = HLADictionaryReader.GetNames(); - HLAstartpos = HLADictionaryReader.GetStartPositions(); - HLAstoppos = HLADictionaryReader.GetStopPositions(); - minstartpos = HLADictionaryReader.GetMinStartPos(); - maxstoppos = HLADictionaryReader.GetMaxStopPos(); - - out.printf("Done! 
%s HLA alleles loaded.\n",HLAreads.length); - - nummatched = new double[HLAreads.length]; - concordance = new double[HLAreads.length]; - numcompared = new double[HLAreads.length]; - - //Load list of polymorphic sites - PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader(); - siteFileReader.ReadFile(PolymorphicSitesFile); - PolymorphicSites = siteFileReader.GetPolymorphicSites(); - NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites(); - numpolymorphicsites = PolymorphicSites.length; - numnonpolymorphicsites = NonPolymorphicSites.length; - - if (!intervalFile.equals("")){ - TextFileReader fileReader = new TextFileReader(); - fileReader.ReadFile(intervalFile); - String[] lines = fileReader.GetLines(); - intervals = new int[lines.length][2]; - for (int i = 0; i < lines.length; i++) { - String[] s = lines[i].split(":"); - String[] intervalPieces = s[1].split("-"); - intervals[i][0] = Integer.valueOf(intervalPieces[0]); - intervals[i][1] = Integer.valueOf(intervalPieces[1]); - } - numIntervals = intervals.length; - } - - out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",numpolymorphicsites,numnonpolymorphicsites); - out.printf("INFO Comparing reads to database ...\n"); - - if (DEBUG){ - //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles); - } - } - return 0; - } - - private double CalculateConcordance(SAMRecord read){ - int readstart = read.getAlignmentStart(); - int readstop = read.getAlignmentEnd(); - char c1, c2; - double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0; - String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString()); - String s2; - int allelestart, allelestop; - - if (ONLYFREQUENT){ - minFreq = 0.0001; - } - - for (int i = 0; i < HLAreads.length; i++){ - nummatched[i] = 0; concordance[i] = 0; numcompared[i] = 0; - freq = GetAlleleFrequency(HLAnames[i]); - //Get 
concordance between read and specific allele - if (readstart <= HLAstoppos[i] && readstop >= HLAstartpos[i] && freq > minFreq){ - s2 = HLAreads[i]; - - allelestart = HLAstartpos[i]; - allelestop = HLAstoppos[i]; - - //Polymorphic sites: always increment denominator, increment numerator when bases are concordant - for (int j = 0; j < numpolymorphicsites; j++){ - pos = PolymorphicSites[j]; - if (DEBUG == true){ - out.printf("DEBUG\tPOS\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],pos,allelestart,allelestop,IsWithin(pos,readstart,readstop), IsWithin(pos,allelestart,allelestop),IsWithinInterval(pos)); - } - if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){ - c1 = s1.charAt(pos-readstart); - c2 = s2.charAt(pos-allelestart); - if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors) - numcompared[i]++; - if (c1 == c2){ - nummatched[i]++; - }else{ - if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ - out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, pos,c1,c2); - } - } - } - } - } - - //Non-polymorphic sites: increment denominator only when bases are discordant - if (numcompared[i] > 0){ - for (int j = 0; j < numnonpolymorphicsites; j++){ - pos = NonPolymorphicSites[j]; - if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){ - c1 = s1.charAt(pos-readstart); - c2 = s2.charAt(pos-allelestart); - if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors) - numcompared[i]++; - if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ - out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2); - } - } - } - } - } - - //Update concordance array - concordance[i]=nummatched[i]/numcompared[i]; - if (concordance[i] > maxConcordance){maxConcordance = concordance[i];} - if (DEBUG == true){ - 
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]); - } - if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ - out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]); - } - if (findFirst && (concordance[i] == 1)){ - break; - } - } - - } - - return maxConcordance; - } - - private double FindMaxAlleleFrequency(double maxConcordance){ - //finds the max frequency of the alleles that share the maximum concordance with the read of interest - double freq, maxFreq = 0.0; - for (int i = 0; i < HLAreads.length; i++){ - if (concordance[i] == maxConcordance && maxConcordance > 0){ - freq = GetAlleleFrequency(HLAnames[i]); - if (freq > maxFreq){maxFreq = freq;} - } - } - return maxFreq; - } - - private boolean IsWithin(int pos, int start, int stop){ - return pos >= start && pos <= stop; - } - - private boolean IsWithinInterval(int pos){ - boolean isWithinInterval = false; - for (int i = 0; i < numIntervals; i++){ - if (pos >= intervals[i][0] && pos <= intervals[i][1]){ - isWithinInterval = true; - break; - } - } - return isWithinInterval; - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - //Calculate concordance for this read and all overlapping reads - if (DEBUG == true){ - out.printf("%s\t%s\n",read.getReadName(),read.getMappingQuality()); - } - if (read.getMappingQuality() > 0 || DEBUG == true){ - double maxConcordance = CalculateConcordance(read); - String stats = "", topAlleles = ""; - if (maxConcordance > 0 || DEBUG == true){ - String readname = read.getReadName(), allelename = ""; double freq; - //For input bam files that contain HLA alleles, find and print allele frequency - out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd()); - - //Print concordance statistics between this read and the most similar HLA 
allele(s) - - for (int i = 0; i < HLAreads.length; i++){ - if (concordance[i] == maxConcordance){ - //freq = GetAlleleFrequency(HLAnames[i]); - if (topAlleles.equals("")){ - topAlleles = HLAnames[i]; - }else{ - topAlleles = topAlleles + "," + HLAnames[i]; - } - stats = String.format("%.1f\t%.3f\t%.0f\t%.0f",1.0,concordance[i],numcompared[i],numcompared[i]-nummatched[i]); - - } - } - out.printf("\t%s\t%s\t%s\n",stats,topAlleles,maxConcordance); - } - } - return 1; - } - - private double GetAlleleFrequency(String allelename){ - double frequency = 0.0; - //Truncate names to 4-digit "A*0101" format - if (allelename.length() >= 10){ - allelename=allelename.substring(4,10); - }else{ - allelename=allelename.substring(4); - } - if (AlleleFrequencies.containsKey(allelename)){ - frequency = Double.parseDouble((String) AlleleFrequencies.get(allelename).toString()); - }else{ - frequency=0.0001; - } - return frequency; - } - - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - @Override - public void onTraversalDone(Integer result) { - // Double check traversal result to make count is the same. - // TODO: Is this check necessary? 
- out.println("[REDUCE RESULT] Traversal result is: " + result); - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java deleted file mode 100644 index 99d104ff6..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java +++ /dev/null @@ -1,152 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.ArrayList; -import java.util.Hashtable; -import java.io.PrintStream; - -/** - * Finds polymorphic sites in the HLA dictionary. 
Usage: java -jar GenomeAnalysisTK.jar -T FindPolymorphicSites -I HLA_DICTIONARY.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT - * @author shermanjia - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -public class FindPolymorphicSitesWalker extends ReadWalker { - @Output - public PrintStream out; - - @Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false) - public String debugRead = ""; - - @Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false) - public boolean findFirst = false; - - @Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false) - public String debugAllele = ""; - - @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false) - public String ethnicity = "Caucasian"; - - @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false) - public boolean ONLYFREQUENT = false; - - String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt"; - String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; - - String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt"; - String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt"; - HLAFileReader HLADictionaryReader = new HLAFileReader(); - - boolean DatabaseLoaded = false; - boolean DEBUG = false; - - String[] HLAnames, HLAreads; - Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites; - double[] SingleAlleleFrequencies; - - double[] nummatched, concordance, numcompared; - int numHLAlleles = 0; - int minstartpos = 0; - int maxstoppos = 0; - - int HLA_A_start 
= 30018310; - int HLA_A_end = 30021211; - - Hashtable AlleleFrequencies = new Hashtable(); - int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; - CigarParser formatter = new CigarParser(); - - public Integer reduceInit() { - if (!DatabaseLoaded){ - DatabaseLoaded = true; - - //Load HLA dictionary - out.printf("INFO Loading HLA dictionary ... "); - - HLADictionaryReader.ReadFile(HLAdatabaseFile); - HLAreads = HLADictionaryReader.GetSequences(); - HLAnames = HLADictionaryReader.GetNames(); - HLAstartpos = HLADictionaryReader.GetStartPositions(); - HLAstoppos = HLADictionaryReader.GetStopPositions(); - minstartpos = HLADictionaryReader.GetMinStartPos(); - maxstoppos = HLADictionaryReader.GetMaxStopPos(); - - out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length); - - nummatched = new double[HLAreads.length]; - concordance = new double[HLAreads.length]; - numcompared = new double[HLAreads.length]; - - FindPolymorphicSites(minstartpos,maxstoppos); - - out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length); - out.printf("INFO Comparing reads to database ...\n"); - - if (DEBUG){ - //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles); - } - } - return 0; - } - - private void FindPolymorphicSites(int start, int stop){ - boolean initialized, polymorphic, examined; - char c = ' ', ch = ' '; - int A = 0, C = 0, G = 0, T = 0; - ArrayList polymorphicsites = new ArrayList(); - ArrayList nonpolymorphicsites = new ArrayList(); - //Find polymorphic sites in dictionary - for (int pos = start; pos <= stop; pos++){ - initialized = false; polymorphic = false; examined = false; - //look across all alleles at specific position to see if it is polymorphic - A = 0; C = 0; G = 0; T = 0; - for (int i = 0; i < HLAreads.length; i++){ - if (pos >= HLAstartpos[i] && 
pos <= HLAstoppos[i]){ - if (!initialized){ - c = HLAreads[i].charAt(pos-HLAstartpos[i]); - initialized = true; - examined = true; - } - ch = HLAreads[i].charAt(pos-HLAstartpos[i]); - if (ch == 'A'){A++;} - else if (ch == 'C'){C++;} - else if (ch == 'T'){T++;} - else if (ch == 'G'){G++;} - - if (ch != c){ - // polymorphicsites.add(pos); - // out.printf("POLYMORPHIC\t6\t%s\n", pos); - polymorphic = true; - // break; - } - } - } - if (polymorphic){ - out.printf("%s\t%s\t%s\t%s\t%s\n",pos,A,C,G,T); - } - //if (!polymorphic && examined){ - // nonpolymorphicsites.add(pos); - // out.printf("CONSERVED\t6\t%s\n", pos); - //} - - } - PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]); - NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]); - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - //Calculate concordance for this read and all overlapping reads - return 1; - } - - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java deleted file mode 100644 index ced40a410..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import java.io.*; -import java.util.Hashtable; -/** - * File reader used by other Walkers to read HLA allele frequencies. 
- * @author shermanjia - */ -public class FrequencyFileReader { - Hashtable MaxFrequencies = new Hashtable(); - Hashtable CommonAlleles = new Hashtable(); - Hashtable [] AlleleFrequencies = null; - String [] Populations = null; - - public Hashtable [] GetAlleleFrequencies(){ - //return allele frequencies for all populations - return AlleleFrequencies; - } - - public Hashtable GetCommonAlleles(){ - //return list of common alleles - return CommonAlleles; - } - - public Hashtable GetMaxFrequencies(){ - //return list of common alleles - return MaxFrequencies; - } - - public String[] GetPopulations(){ - //Return name of populations - return Populations; - } - - public void ReadFile(String filename, String ethnicity){ - try{ - int linenum = 0; - FileInputStream fstream = new FileInputStream(filename); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; - //Read File Line By Line - while ((strLine = br.readLine()) != null) { - linenum++; - s = strLine.split("\\t"); - if (linenum == 1){ - //Determine number of populations, create a hash table for each population - AlleleFrequencies = new Hashtable[s.length-1]; - Populations = new String[s.length-1]; - for (int i = 1; i < s.length; i++){ - Populations[i-1]=s[i]; - AlleleFrequencies[i-1] = new Hashtable(); - } - }else{ - //assign allele frequencies for each population - for (int i = 1; i < s.length; i++){ - if (Double.valueOf(s[i]) > 0.0001){ - CommonAlleles.put(s[0], s[0]); - } - AlleleFrequencies[i-1].put(s[0],s[i]); - if (!MaxFrequencies.containsKey(s[0])){ - MaxFrequencies.put(s[0], s[i]); - }else if (Double.valueOf(MaxFrequencies.get(s[0]).toString()) < Double.valueOf(s[i])){ - MaxFrequencies.put(s[0], s[i]); - } - } - } - } - in.close(); - }catch (Exception e){//Catch exception if any - System.err.println("Exception in FrequencyFileReader (" + e.getMessage() + ")."); - } - } -} - diff --git 
a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java deleted file mode 100644 index 4ff43be87..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java +++ /dev/null @@ -1,782 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.*; -import java.util.Map.Entry; -import java.io.PrintStream; - -/** - * Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T HLACaller -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\ -dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" 
> OUTPUT.phaselikelihoods - * @author shermanjia - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -public class HLACallerWalker extends ReadWalker { - @Output - private PrintStream out; - - @Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true) - public String baseLikelihoodsFile = ""; - - @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false) - public boolean DEBUG = false; - - @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false) - public String filterFile = ""; - - @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false) - public String ethnicity = "CaucasiansUSA"; - - @Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false) - public String debugAlleles = ""; - - @Argument(fullName = "useInterval", shortName = "useInterval", doc = "Use only these intervals in phase calculation", required = false) - public String IntervalsFile = ""; - - @Argument(fullName = "minFreq", shortName = "minFreq", doc = "only consider alleles greater than this frequency", required = false) - public double minFrequency = 0.0; - - @Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false) - public int MAXALLOWEDMISMATCHES = 6; - - @Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false) - public int MINREQUIREDMATCHES = 5; - - @Argument(fullName = "HLAfrequencies", shortName = "HLAfrequencies", doc = "HLA allele frequencies file", required = true) - public String AlleleFrequencyFile = "HLA_FREQUENCIES.txt"; - - @Argument(fullName = "HLAdictionary", shortName = "HLAdictionary", doc = 
"HLA dictionary file", required = true) - public String HLAdatabaseFile = "HLA_DICTIONARY.txt"; - - @Argument(fullName = "turnOffVerboseOutput", shortName = "noVerbose", doc = "Do not output verbose probability descriptions (INFO lines) ", required = false) - protected boolean NO_VERBOSE = false; - - // Initializing variables - - HLAFileReader HLADictionaryReader = new HLAFileReader(); - boolean HLAdataLoaded = false; - String[] HLAnames, HLAreads, Populations; - ArrayList ReadsToDiscard; - Integer[] HLAstartpos, HLAstoppos, PolymorphicSites; - - - int[][] numObservations, totalObservations, intervals; - int[] SNPnumInRead, SNPposInRead, positions; - CigarParser cigarparser = new CigarParser(); - Hashtable MaxLikelihoods = new Hashtable(); - Hashtable MaxFrequencies, CommonAlleles, AlleleCount, LocusCount; - Hashtable[] AlleleFrequencies; - int numIntervals; - double[][] baseLikelihoods; - - ArrayList AllelesToSearch = new ArrayList(); - - // setting error rates for phasing algorithm (1% expected error rate for any genotype) - - double P_err = 0.01; - double P_correct = 1 - P_err; - double L_err = Math.log10(P_err); - double L_correct = Math.log10(P_correct); - - public Integer reduceInit() { - - if (!HLAdataLoaded){ - HLAdataLoaded = true; - - //Load HLA dictionary - - HLADictionaryReader.ReadFile(HLAdatabaseFile); - HLAreads = HLADictionaryReader.GetSequences(); - HLAnames = HLADictionaryReader.GetNames(); - HLAstartpos = HLADictionaryReader.GetStartPositions(); - HLAstoppos = HLADictionaryReader.GetStopPositions(); - - //Load pre-processing file for misaligned reads and list of alleles to search - - if (!filterFile.equals("")){ - //If pre-processing file exists, load contents - SimilarityFileReader similarityReader = new SimilarityFileReader(); - similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES); - ReadsToDiscard = similarityReader.GetReadsToDiscard(); - AllelesToSearch = similarityReader.GetAllelesToSearch(); - AlleleCount = 
similarityReader.GetAlleleCount(); - LocusCount = similarityReader.GetLocusCount(); - if (!NO_VERBOSE) { - for (int i = 0; i < AllelesToSearch.size(); i++){ - out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i))); - } - } - }else{ - ReadsToDiscard = new ArrayList(); - AlleleCount = new Hashtable(); - String name, d4_name; String [] n; - for (int i = 0; i < HLAnames.length; i++){ - name = HLAnames[i].substring(4); - n = name.split("\\*"); - d4_name = n[0] + "*" + n[1].substring(0, 4); - if (!AllelesToSearch.contains(d4_name)){ - AllelesToSearch.add(d4_name); - AlleleCount.put(d4_name, 0); - } - if (!LocusCount.containsKey(n[0])){ - LocusCount.put(n[0], 0); - } - } - } - - //Load genotypes and find polymorphic sites (sites that differ from reference) - - BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader(); - baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true); - baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods(); - positions = baseLikelihoodsReader.GetPositions(); - PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites(); - if (!NO_VERBOSE) { - out.printf("INFO\t%s polymorphic sites found\n",PolymorphicSites.length); - } - - int l = PolymorphicSites.length; - SNPnumInRead = new int[l]; - SNPposInRead = new int[l]; - numObservations = new int[l*5][l*5]; - totalObservations = new int[l][l]; - - //Load allele frequencies for different populations - - FrequencyFileReader HLAfreqReader = new FrequencyFileReader(); - HLAfreqReader.ReadFile(AlleleFrequencyFile,ethnicity); - AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies(); - MaxFrequencies = HLAfreqReader.GetMaxFrequencies(); - CommonAlleles = HLAfreqReader.GetCommonAlleles(); - Populations = HLAfreqReader.GetPopulations(); - - //Load genomic intervals for bam file - - if (!IntervalsFile.equals("")){ - TextFileReader fileReader = new TextFileReader(); - fileReader.ReadFile(IntervalsFile); - String[] lines = 
fileReader.GetLines(); - intervals = new int[lines.length][2]; - for (int i = 0; i < lines.length; i++) { - String[] s = lines[i].split(":"); - String[] intervalPieces = s[1].split("-"); - intervals[i][0] = Integer.valueOf(intervalPieces[0]); - intervals[i][1] = Integer.valueOf(intervalPieces[1]); - } - numIntervals = intervals.length; - } - - - } - return 0; - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - if (!ReadsToDiscard.contains(read.getReadName())){ - UpdateCorrelation(read); - }else{ - - } - return 1; - } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - public void onTraversalDone(Integer numreads) { - String name1, name2, d4_name1, d4_name2, d2_name1, d2_name2; - Double frq1 = 0.0, frq2 = 0.0, log1 = 0.0, log2 = 0.0,alleleLikelihood= 0.0, phaseLikelihood=0.0, likelihood = 0.0; - int numCombinations = 0; - - //For debugging specific alleles - if (!debugAlleles.equals("")){ - String s[] = debugAlleles.split(","); - int index1 = HLADictionaryReader.GetIndex(s[0]); - int index2 = HLADictionaryReader.GetIndex(s[1]); - out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2); - if (index1 > -1 && index2 > -1){ - alleleLikelihood = CalculateAlleleLikelihood(index1,index2,HLAreads,true); - phaseLikelihood = CalculatePhaseLikelihood(index1,index2,true,false); - } - } - - double max; - ArrayList Output = new ArrayList(); - ArrayList Likelihoods = new ArrayList(); - Hashtable TotalProb = new Hashtable(); - //Search pairs of alleles that satisfy initial search criteria - - // Allele 1 - - for (int i = 0; i < HLAnames.length; i++){ - name1 = HLAnames[i].substring(4); - String [] n1 = name1.split("\\*"); - d4_name1 = n1[0] + "*" + n1[1].substring(0, 4); - d2_name1 = n1[0] + "*" + n1[1].substring(0, 2); - if (AllelesToSearch.contains(d4_name1)){ - - if (MaxFrequencies.containsKey(d4_name1)){ - frq1 = Double.parseDouble(MaxFrequencies.get(d4_name1).toString()); - 
}else{ - if (n1[1].length() > 4){if (n1[1].substring(4, 5).equals("N")){frq1 = .00000005;}else{frq1 = .000001;}}else{frq1 = .000001;} - } - - if (frq1 > minFrequency){ - - // Allele 2 - - for (int j = i; j < HLAnames.length; j++){ - name2 = HLAnames[j].substring(4); - String [] n2 = name2.split("\\*"); - d4_name2 = n2[0] + "*" + n2[1].substring(0, 4); - d2_name2 = n2[0] + "*" + n2[1].substring(0, 2); - if (n1[0].equals(n2[0]) && (AllelesToSearch.contains(d4_name2))){ - if (MaxFrequencies.containsKey(d4_name2)){ - frq2 = Double.parseDouble(MaxFrequencies.get(d4_name2).toString()); - }else{ - if (n2[1].length() > 4){if (n2[1].substring(4, 5).equals("N")){frq2 = .00000005;}else{frq2 = .000001;}}else{frq2 = .000001;} - } - - if (frq2 > minFrequency){ - - //Calculate allele and phase likelihoods for each allele pair - alleleLikelihood = CalculateAlleleLikelihood(i,j,HLAreads,false); - numCombinations++; - - //If there is data at the allele pair, continue with other calculations - - if (alleleLikelihood < 0){ - phaseLikelihood = CalculatePhaseLikelihood(i,j,false,false); - log1=Math.log10(frq1); - log2=Math.log10(frq2); - - //sum likelihoods - - likelihood = alleleLikelihood+phaseLikelihood+log1+log2; - if (!MaxLikelihoods.containsKey(n1[0])){MaxLikelihoods.put(n1[0], likelihood);} - - if (likelihood > (Double) MaxLikelihoods.get(n1[0])) { - MaxLikelihoods.put(n1[0], likelihood); - } - Likelihoods.add(likelihood); - String data = String.format("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f",n1[0],name1,name2,alleleLikelihood,phaseLikelihood,log1,log2,likelihood); - Output.add(data); - if (!NO_VERBOSE) { - out.printf("INFO\t%s\n",data); - } - if (DEBUG){ - - } - } - } - } - } - } - } - } - //Print output - out.printf("Locus\tA1\tA2\tGeno\tPhase\tFrq1\tFrq2\tL\tProb\tReads1\tReads2\tLocus\tEXP"); - for (int i = 0; i < Populations.length; i++){ - out.printf("\t%s",Populations[i]); - } - out.printf("\n"); - - //Calculate probabilities for each locus - Double probSum = 0.0, prob = 
0.0, f1 = 0.0, f2 = 0.0, aLikelihood4 = 0.0, pLikelihood4 = 0.0; - Integer count = 0; - Hashtable HLA4DigitProbs = new Hashtable(); - Hashtable HLA4DigitLs = new Hashtable(); - Hashtable HLA4DigitCount = new Hashtable(); - Hashtable HLA4DigitF1 = new Hashtable(); - Hashtable HLA4DigitF2 = new Hashtable(); - Hashtable HLA4DigitA = new Hashtable(); - Hashtable HLA4DigitP = new Hashtable(); - - String key; - Enumeration keys = LocusCount.keys(); - while (keys.hasMoreElements()){ - String locus = keys.nextElement().toString(); - - probSum = 0.0; - ArrayList localOutput = new ArrayList(); - ArrayList localLikelihoods = new ArrayList(); - - //Sum probabilities for each locus - - for (int j = 0; j < Output.size(); j++){ - String data = Output.get(j).toString(); - String [] d = data.split("\\t"); - if (d[0].equals(locus)){ - localOutput.add(data); - likelihood = (Double)Likelihoods.get(j)-(Double)MaxLikelihoods.get(locus); - localLikelihoods.add(likelihood); - probSum = probSum + Math.pow(10, likelihood); - //out.printf("INFO\t%s\t%s\t%.2f\t%.2f\t%.2f\t%s\n",locus,data,likelihood,(Double)MaxLikelihoods.get(locus),(Double)Likelihoods.get(j),probSum); - } - } - - //aggregate statistics for 4-digit types - - String A1 = "", A2 = "", a1 = "", a2 = ""; - String [] s, s1, s2; - Double prob4digit = 0.0; - int n = 0; - - for (int j = 0; j < localOutput.size(); j++){ - String data = localOutput.get(j).toString(); - prob = Math.pow(10, (Double)localLikelihoods.get(j))/probSum; - - if (prob > 0.005){ - s = data.split("\\t"); - s1 = s[1].split("\\*"); - s2 = s[2].split("\\*"); - a1 = s1[0] + "*" + s1[1].substring(0,4); - a2 = s2[0] + "*" + s2[1].substring(0,4); - key = a1 + "," + a2; - aLikelihood4 = Double.valueOf(s[3]); - pLikelihood4 = Double.valueOf(s[4]); - likelihood = aLikelihood4 + pLikelihood4 + f1 + f2; - f1 = Double.valueOf(s[5]); - f2 = Double.valueOf(s[6]); - if (!HLA4DigitProbs.containsKey(key)){ - HLA4DigitProbs.put(key, prob); - HLA4DigitLs.put(key, likelihood); - 
HLA4DigitCount.put(key, 1); - HLA4DigitF1.put(key,f1); - HLA4DigitF2.put(key,f2); - HLA4DigitA.put(key,aLikelihood4); - HLA4DigitP.put(key,pLikelihood4); - }else{ - prob = prob + Double.valueOf(HLA4DigitProbs.get(key).toString()); - HLA4DigitProbs.put(key, prob); - likelihood = likelihood + Double.valueOf(HLA4DigitLs.get(key).toString()); - HLA4DigitLs.put(key, likelihood); - n = Integer.valueOf(HLA4DigitCount.get(key).toString()) + 1; - HLA4DigitCount.put(key, n); - aLikelihood4 = aLikelihood4 + Double.valueOf(HLA4DigitA.get(key).toString()); - HLA4DigitA.put(key, aLikelihood4); - pLikelihood4 = pLikelihood4 + Double.valueOf(HLA4DigitP.get(key).toString()); - HLA4DigitP.put(key, pLikelihood4); - } - - } - } - } - - //Print results - Enumeration P = HLA4DigitProbs.keys(); - String K = ""; String [] s, s1, s2; - double count1, count2, locusCount, accountedFor; - - // Sort hashtable. - Vector v = new Vector(HLA4DigitProbs.keySet()); - Collections.sort(v); - - // Display (sorted) hashtable. 
- for (Enumeration e = v.elements(); e.hasMoreElements();) { - K = (String)e.nextElement(); - prob = (Double) HLA4DigitProbs.get(K); - - likelihood = (Double) HLA4DigitLs.get(K); - count = (Integer) HLA4DigitCount.get(K); - s = K.split("\\,"); - s1 = s[0].split("\\*"); name1 = s1[1]; - s2 = s[1].split("\\*"); name2 = s2[1]; - aLikelihood4 = (Double) HLA4DigitA.get(K); - pLikelihood4 = (Double) HLA4DigitP.get(K); - f1 = (Double) HLA4DigitF1.get(K); - f2 = (Double) HLA4DigitF2.get(K); - count1 = Double.valueOf(AlleleCount.get(s[0]).toString()); - count2 = Double.valueOf(AlleleCount.get(s[1]).toString()); - locusCount = Double.valueOf(LocusCount.get(s1[0]).toString()); - if (s[0].equals(s[1])){ - accountedFor = count1 / locusCount; - }else{ - accountedFor = (count1 + count2) / locusCount; - } - if (prob > 0.1){ - out.printf("%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor); - for (int i = 0; i < Populations.length; i++){ - if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;} - if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;} - if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");} - } - out.print("\n"); - } - if (!NO_VERBOSE) { - out.printf("INFO\t%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor); - } - for (int i = 0; i < Populations.length; i++){ - if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;} - if (AlleleFrequencies[i].containsKey(s[1])){f2 = 
Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;} - if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");} - } - out.print("\n"); - } - } - - Comparator valueComparator = new Comparator() { - @Override public int compare(Double val1, Double val2) { - return val1.compareTo(val2); - } - }; - - private Integer[] InitializePolymorphicSites(){ - int HLA_A_start = 30018310, HLA_A_end = 30021211, num_A_positions = HLA_A_end - HLA_A_start + 1; - int HLA_B_start = 31430239, HLA_B_end = 31432914, num_B_positions = HLA_B_end - HLA_B_start + 1; - int HLA_C_start = 31344925, HLA_C_end = 31347827, num_C_positions = HLA_C_end - HLA_C_start + 1; - Integer[] polymorphicSites = new Integer[num_A_positions+num_B_positions+num_C_positions]; - for (int i = 0; i < num_A_positions; i++){ - polymorphicSites[i]=HLA_A_start + i; - } - for (int i = 0; i < num_C_positions; i++){ - polymorphicSites[i+num_A_positions]=HLA_C_start + i; - } - for (int i = 0; i < num_B_positions; i++){ - polymorphicSites[i+num_A_positions+num_C_positions]=HLA_B_start + i; - } - return polymorphicSites; - } - - private int IndexOf(char c){ - switch(c){ - case 'A': return 0; - case 'C': return 1; - case 'G': return 2; - case 'T': return 3; - //case 'D': return 4; - default: return -1; - } - } - - - private boolean IsWithinInterval(int pos){ - boolean isWithinInterval = false; - for (int i = 0; i < numIntervals; i++){ - if (pos >= intervals[i][0] && pos <= intervals[i][1]){ - isWithinInterval = true; - break; - } - } - return isWithinInterval; - } - - private void UpdateCorrelation(SAMRecord read){ - //Updates correlation table with SNPs from specific read (for phasing) - String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString()); - ArrayList SNPsInRead = new ArrayList(); - ArrayList readindex = new ArrayList(); - - int readstart = read.getAlignmentStart(); - int readend = read.getAlignmentEnd(); - int 
numPositions = PolymorphicSites.length; - char c1, c2; - int a, b, i, j, SNPcount = 0; - - //Find all SNPs in read - for (i = 0; i < numPositions; i++){ - if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){ - SNPnumInRead[i] = SNPcount; - SNPposInRead[i] = PolymorphicSites[i]-readstart; - SNPcount++; - }else{ - SNPnumInRead[i] = -1; - SNPposInRead[i] = -1; - } - } - - //Update correlation table; for each combination of SNP positions - for (i = 0; i < numPositions; i++){ - if (SNPnumInRead[i] > -1){ - c1 = s.charAt(SNPposInRead[i]); - if (IndexOf(c1) > -1){ - for (j = i+1; j < numPositions; j ++){ - if (SNPnumInRead[j] > -1){ - c2 = s.charAt(SNPposInRead[j]); - if (IndexOf(c2) > -1){ - a = i*5 + IndexOf(c1); - b = j*5 + IndexOf(c2); - - numObservations[a][b]++; - totalObservations[i][j]++; - if (DEBUG){ - //out.printf("INFO %s\t%s %s\t[i=%s,j=%s]\t[%s,%s]\t[%s,%s]\n",read.getReadName(),PolymorphicSites[i],PolymorphicSites[j],i,j,c1,c2,a,b); - } - } - } - } - - } - } - } - } - -private int GenotypeIndex(char a, char b){ - switch(a){ - case 'A': - switch(b){ - case 'A': return 0; - case 'C': return 1; - case 'G': return 2; - case 'T': return 3; - }; - case 'C': - switch(b){ - case 'A': return 1; - case 'C': return 4; - case 'G': return 5; - case 'T': return 6; - }; - case 'G': - switch(b){ - case 'A': return 2; - case 'C': return 5; - case 'G': return 7; - case 'T': return 8; - }; - case 'T': - switch(b){ - case 'A': return 3; - case 'C': return 6; - case 'G': return 8; - case 'T': return 9; - }; - default: return -1; - } - } - - private double CalculateAlleleLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){ - //Calculates likelihood for specific allele pair - String read1 = HLAalleles[a1]; - String read2 = HLAalleles[a2]; - int start1 = HLAstartpos[a1]; - int start2 = HLAstartpos[a2]; - int stop1 = HLAstoppos[a1]; - int stop2 = HLAstoppos[a2]; - double likelihood = 0; - int pos, index; - char c1, c2; - - - for (int i = 0; i < 
positions.length; i++){ - pos = positions[i]; - if (pos < stop1 && pos > start1 && pos < stop2 && pos > start2){ - index = GenotypeIndex(read1.charAt(pos-start1),read2.charAt(pos-start2)); - if (index > -1){ - likelihood = likelihood + baseLikelihoods[i][index]; - if (!NO_VERBOSE || debug){ - c1 = read1.charAt(pos-start1); - c2 = read2.charAt(pos-start2); - out.printf("INFO: DEBUG %s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",HLAnames[a1],HLAnames[a2],pos,c1,c2,index,likelihood); - } - } - } - } - return likelihood; - } - - private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG, boolean SINGLEALLELE){ - //calculate the likelihood that the particular combination of alleles satisfies the phase count data - double likelihood = 0, prob = 0; - int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1]; - int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2]; - int combinedstart = Math.max(readstart1,readstart2); - int combinedstop = Math.min(readend1,readend2); - - int numPositions = PolymorphicSites.length, SNPcount = 0; - int i, j, a1, a2, b1, b2; - char c11, c12, c21, c22; - int numInPhase = 0, numOutOfPhase = 0; - double sumInPhase = 0.0, sumObservations = 0.0; - - - //Find all SNPs in read - for (i = 0; i < numPositions; i++){ - - if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop ){ // && IsWithinInterval(PolymorphicSites[i]) - if (PRINTDEBUG){ - out.printf("DEBUG\t%s\t%s\n",PolymorphicSites[i],SNPcount); - } - SNPnumInRead[i] = SNPcount; - SNPposInRead[i] = PolymorphicSites[i]-combinedstart; - SNPcount++; - }else{ - SNPnumInRead[i] = -1; - SNPposInRead[i] = -1; - } - } - String s1 = HLAreads[alleleIndex1]; - String s2 = HLAreads[alleleIndex2]; - if (PRINTDEBUG){ - out.printf("DEBUG %s SNPs found in %s and %s between %s and %s\n",SNPcount,HLAnames[alleleIndex1], HLAnames[alleleIndex2],combinedstart,combinedstop); - } - //Iterate through every pairwise 
combination of SNPs, and update likelihood for the allele combination - for (i = 0; i < numPositions; i++){ - if (SNPnumInRead[i] > -1){ - c11 = s1.charAt(SNPposInRead[i]); - c21 = s2.charAt(SNPposInRead[i]); - if (IndexOf(c11) > -1 && IndexOf(c21) > -1){ - for (j = i+1; j < numPositions; j ++){ - if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){ - c12 = s1.charAt(SNPposInRead[j]); - c22 = s2.charAt(SNPposInRead[j]); - if (IndexOf(c12) > -1 && IndexOf(c22) > -1){ - a1 = i*5 + IndexOf(c11); - b1 = j*5 + IndexOf(c12); - a2 = i*5 + IndexOf(c21); - b2 = j*5 + IndexOf(c22); - //check if the two alleles are identical at the chosen 2 locations - if ((c11 == c21) && (c12 == c22)){ - numInPhase = numObservations[a1][b1]; - }else{ - numInPhase = numObservations[a1][b1] + numObservations[a2][b2]; - } - numOutOfPhase = totalObservations[i][j] - numInPhase; - sumInPhase += (double) numInPhase; - sumObservations += (double) totalObservations[i][j]; - if (SINGLEALLELE){ - likelihood = sumInPhase / sumObservations; - }else{ - likelihood += numInPhase * L_correct + numOutOfPhase * L_err; - } - //prob = Math.max((double) numInPhase / (double) totalObservations[i][j], 0.0001); - //likelihood += Math.log10(prob); - //likelihood = Math.max(Math.log10(sumInPhase / sumObservations),-10); - - if (PRINTDEBUG){ - out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%s / %s\t %.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],sumInPhase,sumObservations,likelihood); - } - break; - } - } - } - } - } - } - return likelihood; - } - - private void ExtraCode(){ - String name1, name2; - //Pre-process homozygous combinations to determine top possible alleles (for efficiency) - Hashtable Alleles2Digit = new Hashtable(); - Hashtable Phase2Digit = new Hashtable(); - Hashtable Count2Digit = new Hashtable(); - - Hashtable AllelesAtLocus = new Hashtable(); - 
ArrayList Loci = new ArrayList(); - double[] AlleleLikelihoods2 = new double[HLAnames.length]; - double[] PhaseLikelihoods2 = new double[HLAnames.length]; - for (int i = 0; i < HLAnames.length; i++){ - name1 = HLAnames[i].substring(4); - String [] n1 = name1.split("\\*"); - AlleleLikelihoods2[i] = CalculateAlleleLikelihood(i,i,HLAreads,false); - PhaseLikelihoods2[i] = CalculatePhaseLikelihood(i,i,false,true); - if (AlleleLikelihoods2[i] < 0){ - name2 = n1[0] + "*" + n1[1].substring(0, 4); - if (!Loci.contains(n1[0])){ - Loci.add(n1[0]); - MaxLikelihoods.put(n1[0], 0.0); - AllelesAtLocus.put(n1[0], 1); - }else{ - AllelesAtLocus.put(n1[0], 1+(Integer)AllelesAtLocus.get(n1[0])); - } - if (!Alleles2Digit.containsKey(name2)){ - Alleles2Digit.put(name2, AlleleLikelihoods2[i]); - Phase2Digit.put(name2, PhaseLikelihoods2[i]); - Count2Digit.put(name2, 1.0); - }else { - if (AlleleLikelihoods2[i] > (Double) Alleles2Digit.get(name2)){ - Alleles2Digit.put(name2, AlleleLikelihoods2[i]); - } - if (PhaseLikelihoods2[i] > (Double) Phase2Digit.get(name2)){ - Phase2Digit.put(name2, PhaseLikelihoods2[i]); - } - Count2Digit.put(name2,1.0+(Double)Count2Digit.get(name2)); - } - } - } - - //Sort alleles at 2 digit resolution for each locus - - for (int i = 0; i < Loci.size(); i++){ - Enumeration k = Alleles2Digit.keys(); - Hashtable AllelesAtLoci = new Hashtable(); - HashMap map = new HashMap(); - int numalleles = 0; - //find alleles at the locus - while( k.hasMoreElements() ){ - name1 = k.nextElement().toString(); - String [] n1 = name1.split("\\*"); - if (Loci.get(i).equals(n1[0])){ - numalleles++; - map.put(name1,-1 * (Double) Alleles2Digit.get(name1)); - AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1); - //out.printf("%s\t%.2f\n",name1,-1 * (Double) Alleles2Digit.get(name1)); - } - - } - - //Sort alleles at locus, mark top six 2-digit classes for deep search - List> entries = new ArrayList>(map.entrySet()); - Collections.sort(entries, new Comparator>() { - public int 
compare(Entry e1, Entry e2) { - return e1.getValue().compareTo(e2.getValue()); - } - }); - int num = 1; - for (Map.Entry entry : entries) { - if (num <= Math.max(5,entries.size()/8)){ - AllelesToSearch.add(entry.getKey()); - if (!NO_VERBOSE) { - out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey())); - } - num++; - }else if (!NO_VERBOSE) { - if (!AllelesToSearch.contains(entry.getKey())){ - out.printf("INFO\t%s\t%.2f\t%.2f\tNotSearched\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey())); - }else{ - out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey())); - } - } - } - - if (!NO_VERBOSE) { - out.printf("INFO\n"); - } - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLAFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLAFileReader.java deleted file mode 100644 index fc78ff078..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLAFileReader.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * To change this template, choose Tools | Templates - * and open the template in the editor. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import java.io.*; -import java.util.ArrayList; -/** - * - * @author shermanjia - */ -public class HLAFileReader { - ArrayList Sequences = new ArrayList(); - ArrayList Names = new ArrayList(); - ArrayList StartPositions = new ArrayList(); - ArrayList StopPositions = new ArrayList(); - int minstartpos; - int maxstoppos; - - CigarParser formatter = new CigarParser(); - - public String[] GetNames(){ - return Names.toArray(new String[Names.size()]); - } - - public String[] GetSequences(){ - return Sequences.toArray(new String[Sequences.size()]); - } - - public Integer[] GetStartPositions(){ - return StartPositions.toArray(new Integer[StartPositions.size()]); - } - - public Integer[] GetStopPositions(){ - return StopPositions.toArray(new Integer[StopPositions.size()]); - } - - - public Integer GetMinStartPos(){ - return minstartpos; - } - - public Integer GetMaxStopPos(){ - return maxstoppos; - } - - public int GetIndex(String readname){ - if (Names.contains(readname)){ - return Names.indexOf(readname); - }else{ - return -1; - } - } - - public void ReadFile(String filename){ - try{ - FileInputStream fstream = new FileInputStream(filename); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; - //Read File Line By Line - while ((strLine = br.readLine()) != null) { - s = strLine.split("\\t"); - Sequences.add(s[3]); - Names.add(s[0]); - StartPositions.add(Integer.valueOf(s[1])); - StopPositions.add(Integer.valueOf(s[2])); - minstartpos = Math.min(minstartpos, Integer.valueOf(s[1])); - maxstoppos = Math.max(maxstoppos, Integer.valueOf(s[2])); - } - in.close(); - }catch (Exception e){//Catch exception if any - System.err.println("HLAFileReader Error: " + e.getMessage()); - } - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java 
b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java deleted file mode 100644 index 393a3a0b9..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java +++ /dev/null @@ -1,308 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Output; - -import java.io.*; -import java.util.ArrayList; -import java.util.Hashtable; -/** - * ImputeAllelesWalker fills in missing intronic info for HLA alleles based on the the most similar HLA allele per read - * @author shermanjia - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -public class ImputeAllelesWalker extends ReadWalker { - @Output - PrintStream out; - - String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam"; -// String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS1.closest"; - String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS2.closest"; - - boolean DatabaseLoaded = false; - boolean DEBUG = false; - - ArrayList HLAreads = new ArrayList(); - ArrayList HLAcigars = new ArrayList(); - ArrayList HLAnames = new ArrayList(); - ArrayList HLApositions = new ArrayList(); - double[] SingleAlleleFrequencies; - - int numHLAlleles = 0; - int[] HLAstartpos; - int[] HLAstoppos; - int minstartpos = 0; - int maxstoppos = 0; - - int HLA_A_start = 30018310; - int HLA_A_end = 30021211; - int HLA_B_start = 31430239; - int HLA_B_end = 31432914; - int HLA_C_start = 31344925; - int HLA_C_end = 31347827; - int HLA_DQA1_start = 32713161; - int HLA_DQA1_end = 32719407; - int HLA_DQB1_start = 32735635; - int HLA_DQB1_end = 32742444; - int HLA_DPA1_start = 33140772; - int HLA_DPA1_end = 
33149356; - int HLA_DPB1_start = 33151738; - int HLA_DPB1_end = 33162954; - int HLA_DRB1_start = 32654525; - int HLA_DRB1_end = 32665540; - - - ArrayList PolymorphicSites = new ArrayList(); - - Hashtable ClosestAllele = new Hashtable(); - int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1, iDRBstart = -1, iDRBstop = -1, iDQAstart = -1, iDQAstop = -1, iDQBstart = -1, iDQBstop = -1, iDPAstart = -1, iDPAstop = -1, iDPBstart = -1, iDPBstop = -1; - CigarParser formatter = new CigarParser(); - - public Integer reduceInit() { - if (!DatabaseLoaded){ - try{ - out.printf("Reading HLA database ...\n"); - FileInputStream fstream = new FileInputStream(HLAdatabaseFile); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; - //Read File Line By Line - int i = 0; - while ((strLine = br.readLine()) != null) { - s = strLine.split("\\t"); - - if (s.length>=10){ - //Parse the reads with cigar parser - HLAreads.add(formatter.FormatRead(s[5],s[9])); - HLAcigars.add(s[5]); - HLAnames.add(s[0]); - - HLApositions.add(s[3]); - if (s[0].indexOf("HLA_A") > -1){ - if (iAstart < 0){iAstart=i;} - iAstop = i; i++; - }else if (s[0].indexOf("HLA_B") > -1){ - if (iBstart < 0){iBstart=i;} - iBstop = i; i++; - }else if (s[0].indexOf("HLA_C") > -1){ - if (iCstart < 0){iCstart=i;} - iCstop = i; i++; - }else if (s[0].indexOf("HLA_DRB1") > -1){ - if (iDRBstart < 0){iDRBstart=i;} - iDRBstop = i; i++; - }else if (s[0].indexOf("HLA_DQA1") > -1){ - if (iDQAstart < 0){iDQAstart=i;} - iDQAstop = i; i++; - }else if (s[0].indexOf("HLA_DQB1") > -1){ - if (iDQBstart < 0){iDQBstart=i;} - iDQBstop = i; i++; - }else if (s[0].indexOf("HLA_DPA1") > -1){ - if (iDPAstart < 0){iDPAstart=i;} - iDPAstop = i; i++; - }else if (s[0].indexOf("HLA_DPB1") > -1){ - if (iDPBstart < 0){iDPBstart=i;} - iDPBstop = i; i++; - } - } - } - in.close(); - int n = HLApositions.size(); numHLAlleles = n; - 
HLAstartpos = new int[n]; HLAstoppos = new int[n]; - SingleAlleleFrequencies = new double[n]; - - - for (i = 0; i < n; i++){ - //Find start and stop positions for each allele - HLAstartpos[i]=Integer.parseInt(HLApositions.get(i)); - HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1; - if (minstartpos == 0){minstartpos = HLAstartpos[i];} - minstartpos = Math.min(minstartpos, HLAstartpos[i]); - maxstoppos = Math.max(maxstoppos, HLAstoppos[i]); - SingleAlleleFrequencies[i]=0.0; - //Initialize matrix of probabilities / likelihoods - - } - out.printf("DONE! Read %s alleles\n",HLAreads.size()); - }catch (Exception e){//Catch exception if any - System.err.println("ImputeAllelsWalker Error: " + e.getMessage()); - } - - try{ - out.printf("Reading closest allele file ..."); - FileInputStream fstream = new FileInputStream(ClosestAllelesFile); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; - //Read File Line By Line - int count = 0; - while ((strLine = br.readLine()) != null) { - s = strLine.split("\\t"); - ClosestAllele.put(s[0], s[2]); -// out.printf("loading: %s\t%s\n",s[0],s[2]); - count++; - } - in.close(); - out.printf("Done! 
Read %s alleles\n",count); - }catch (Exception e){//Catch exception if any - System.err.println("ImputeAllelsWalker Error: " + e.getMessage()); - } - - char c; - DatabaseLoaded = true; - - out.printf("Imputing alleles ...\n"); - - if (DEBUG){ - //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles); - } - } - return 0; - } - - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - int readstart = read.getAlignmentStart(); - int readstop = read.getAlignmentEnd(); - int startimputation = 0, stopimputation = 0; - - String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString()); - char c; - String readstring = "", name = "", cigar = "", qualitystring = ""; - int numM = 0, numI = 0, numD = 0; - - name = read.getReadName(); - - String matchedAllele = (String) ClosestAllele.get(name); - - //out.printf("%s\t%s\n",name,matchedAllele); - int index = HLAnames.indexOf(matchedAllele); - - String matchedRead = HLAreads.get(index); - - if (name.indexOf("HLA_A") > -1){ - startimputation = HLA_A_start; - stopimputation = HLA_A_end; - } else if (name.indexOf("HLA_B") > -1){ - startimputation = HLA_B_start; - stopimputation = HLA_B_end; - } else if (name.indexOf("HLA_C") > -1){ - startimputation = HLA_C_start; - stopimputation = HLA_C_end; - } else if (name.indexOf("HLA_DRB1") > -1){ - startimputation = HLA_DRB1_start; - stopimputation = HLA_DRB1_end; - } else if (name.indexOf("HLA_DQA1") > -1){ - startimputation = HLA_DQA1_start; - stopimputation = HLA_DQA1_end; - } else if (name.indexOf("HLA_DQB1") > -1){ - startimputation = HLA_DQB1_start; - stopimputation = HLA_DQB1_end; - } else if (name.indexOf("HLA_DPA1") > -1){ - startimputation = HLA_DPA1_start; - stopimputation = HLA_DPA1_end; - } else if (name.indexOf("HLA_DPB1") > -1){ - startimputation = HLA_DPB1_start; - stopimputation = HLA_DPB1_end; - } - - 
//out.printf("DEBUG %s\t%s\t%s\t%s\t%s\n",name,matchedAllele,index,startimputation,stopimputation); - for (int i = startimputation; i <= stopimputation; i++){ - //if position is within read - if (i >= readstart && i <= readstop){ - c = s1.charAt(i-readstart); - //if position is not missing - if (c != 'D'){ - readstring = readstring + c; - qualitystring = qualitystring + 'I'; - numM++; - if (numD > 0){ - cigar = cigar + String.valueOf(numD) + "D"; - numD = 0; - } else if (numI > 0){ - cigar = cigar + String.valueOf(numI) + "I"; - numI = 0; - } - //if position is missing, get base from matched allele - }else{ - c = matchedRead.charAt(i-HLAstartpos[index]); - //if matched allele is also missing / deleted at position - if (c == 'D'){ - numD++; - if (numM > 0){ - cigar = cigar + String.valueOf(numM) + "M"; - numM = 0; - } - //if matched allele is not missing / deleted at position - }else{ - readstring = readstring + c; - qualitystring = qualitystring + 'I'; - numM++; - if (numD > 0){ - cigar = cigar + String.valueOf(numD) + "D"; - numD = 0; - } else if (numI > 0){ - cigar = cigar + String.valueOf(numI) + "I"; - numI = 0; - } - } - } - //if position is outside of range of read, look at matched allele - }else{ - //if within range of matched allele - if (i >= HLAstartpos[index] && i <= HLAstoppos[index]){ - c = matchedRead.charAt(i-HLAstartpos[index]); - //if matched allele is also missing / deleted at position - if (c == 'D'){ - numD++; - if (numM > 0){ - cigar = cigar + String.valueOf(numM) + "M"; - numM = 0; - } - //if matched allele is not missing / deleted at position - }else{ - readstring = readstring + c; - qualitystring = qualitystring + 'I'; - numM++; - if (numD > 0){ - cigar = cigar + String.valueOf(numD) + "D"; - numD = 0; - } else if (numI > 0){ - cigar = cigar + String.valueOf(numI) + "I"; - numI = 0; - } - } - }else{ - numD++; - if (numM > 0){ - cigar = cigar + String.valueOf(numM) + "M"; - numM = 0; - } - } - } - } - - if (numM > 0){ - cigar = cigar + 
String.valueOf(numM) + "M"; - }else if(numD > 0){ - cigar = cigar + String.valueOf(numD) + "D"; - }else if(numI > 0){ - cigar = cigar + String.valueOf(numI) + "I"; - } - - out.printf("%s\t0\t6\t%s\t99\t%s\t*\t0\t0\t%s\t%s\n",name,startimputation,cigar,readstring,qualitystring); - - - return 1; - } - - - - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java deleted file mode 100644 index 73701f2ea..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * To change this template, choose Tools | Templates - * and open the template in the editor. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import java.io.*; -import java.util.ArrayList; -/** - * - * @author shermanjia - */ -public class PolymorphicSitesFileReader { - ArrayList PolymorphicSites = new ArrayList(); - ArrayList NonPolymorphicSites = new ArrayList(); - - - public Integer[] GetPolymorphicSites(){ - return PolymorphicSites.toArray(new Integer[PolymorphicSites.size()]); - } - - public Integer[] GetNonPolymorphicSites(){ - return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]); - } - - public void AddSites(Integer [] sites){ - for (int i = 0; i < sites.length; i++){ - if (!PolymorphicSites.contains(sites[i])){ - PolymorphicSites.add(sites[i]); - } - } - } - - public void ReadFile(String filename){ - try{ - FileInputStream fstream = new FileInputStream(filename); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; - //Read File Line By Line - int i = 0; - while ((strLine = br.readLine()) != null) { - s = 
strLine.split("\\t"); - if (Double.valueOf(s[8]) > 0.1){ - PolymorphicSites.add(Integer.valueOf(s[0])); - }else{ - NonPolymorphicSites.add(Integer.valueOf(s[0])); - } - } - in.close(); - }catch (Exception e){//Catch exception if any - System.err.println("PolymorphicSitesFileReader Error: " + e.getMessage()); - } - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ReadCigarFormatter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ReadCigarFormatter.java deleted file mode 100644 index 2269b440c..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ReadCigarFormatter.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * To change this template, choose Tools | Templates - * and open the template in the editor. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -/** - * Returns formatted read given read string and cigar string - * Essentially removes header bases, soft clipped bases, and currently removes insertions - * Deletions coded as "D" - * - * @author shermanjia - */ - -public class ReadCigarFormatter { - public String FormatRead(String cigar, String read){ - // returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur - String formattedRead = ""; char c; String count; - int cigarPlaceholder = 0; int subcigarLength = 0; - int readPlaceholder = 0; int subreadLength = 0; - - //reads cigar string - for (int i = 0; i < cigar.length(); i++){ - c = cigar.charAt(i); - if (c == 'M'){ - //If reach M for match/mismatch, get number immediately preceeding 'M' and tack on that many characters to sequence - subcigarLength = i-cigarPlaceholder; - count = cigar.substring(cigarPlaceholder, i); - - subreadLength = Integer.parseInt(count); - formattedRead = formattedRead + read.substring(readPlaceholder, readPlaceholder+subreadLength); - - //increment placeholders - cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + 
subreadLength; - } else if (c == 'I'){ - //***NOTE: To be modified later if needed (insertions removed here)*** - - //If reaches I for insertion, get number before 'I' and skip that many characters in sequence - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //increment placeholders without adding inserted bases to sequence (effectively removes insertion). - cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + subreadLength; - } else if (c == 'H' || c == 'S'){ - //(H = Headers or S = Soft clipped removed here)*** - - //If reaches H for insertion, get number before 'H' and skip that many characters in sequence - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion). - cigarPlaceholder = i+1; - } else if (c == 'D'){ - //If reaches D for deletion, insert 'D' into sequence as placeholder - count = cigar.substring(cigarPlaceholder, i); - subreadLength = Integer.parseInt(count); - - //Add one 'D' for each deleted base - String deletion = ""; - for (int j = 1; j <= subreadLength; j++){ - deletion = deletion + "D"; - } - - //update placeholders - formattedRead = formattedRead + deletion; - cigarPlaceholder = i+1; - } - - } - return formattedRead; - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java deleted file mode 100644 index 9d2b509cf..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * To change this template, choose Tools | Templates - * and open the template in the editor. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import java.io.*; -import java.util.ArrayList; -import java.util.Hashtable; -/** - * - * @author shermanjia - */ -public class SimilarityFileReader { - ArrayList ReadsToDiscard = new ArrayList(); - ArrayList AllelesToSearch = new ArrayList(); - Hashtable AlleleCount = new Hashtable(); - Hashtable LocusCount = new Hashtable(); - Hashtable Concordance = new Hashtable(); - Hashtable NumMatches = new Hashtable(); - Hashtable NumMismatches = new Hashtable(); - - public ArrayList GetReadsToDiscard(){ - return ReadsToDiscard; - } - - public ArrayList GetAllelesToSearch(){ - return AllelesToSearch; - } - - public String[] GetReadsToDiscardArray(){ - return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]); - } - - public Hashtable GetAlleleCount(){ - return AlleleCount; - } - - public Hashtable GetLocusCount(){ - return LocusCount; - } - - public Hashtable GetConcordance(){ - return Concordance; - } - - public Hashtable GetNumMatches(){ - return NumMatches; - } - - public Hashtable GetNumMismatches(){ - return NumMismatches; - } - - public void ReadFile(String filename, int minAllowedMismatches, int minRequiredMatches){ - try{ - FileInputStream fstream = new FileInputStream(filename); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null, alleles = null, a = null; String allele; - //Read File Line By Line - int i = 0; - while ((strLine = br.readLine()) != null) { - s = strLine.split("\\t"); - if (s.length >= 6){ - Double matchFraction = Double.valueOf(s[3]); - int numMismatches = Integer.valueOf(s[5]); - int numMatches = Integer.valueOf(s[4]); - Concordance.put(s[0],matchFraction); - NumMatches.put(s[0], s[4]); - NumMismatches.put(s[0], numMismatches); - if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < minRequiredMatches){ - 
ReadsToDiscard.add(s[0]); - }else{ - Hashtable fourDigitAlleles = new Hashtable(); - alleles = s[6].split("\\,"); - if (alleles.length > 0){ - a = alleles[0].split("\\_"); - s = a[1].split("\\*"); - if (!LocusCount.containsKey(s[0])){ - LocusCount.put(s[0], 1); - }else{ - LocusCount.put(s[0], (Integer) LocusCount.get(s[0]) + 1); - } - } - for (int j = 0; j < alleles.length; j++){ - a = alleles[j].split("\\_"); - s = a[1].split("\\*"); - allele = s[0] + "*" + s[1].substring(0,4); - - if (!fourDigitAlleles.containsKey(allele)){ - fourDigitAlleles.put(allele, allele); - if (!AlleleCount.containsKey(allele)){ - AlleleCount.put(allele, 1); - }else{ - AlleleCount.put(allele, (Integer) AlleleCount.get(allele) + 1); - } - - if ((Integer) AlleleCount.get(allele) > 1 && !AllelesToSearch.contains(allele)){ - AllelesToSearch.add(allele); - } - } - } - } - } - } - in.close(); - }catch (Exception e){//Catch exception if any - //System.err.println("SimilarityFile Error: " + e.getMessage()); - } - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/TextFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/TextFileReader.java deleted file mode 100644 index adc52546c..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/TextFileReader.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * To change this template, choose Tools | Templates - * and open the template in the editor. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import java.io.*; -import java.util.ArrayList; -/** - * - * @author shermanjia - */ -public class TextFileReader { - ArrayList lines = new ArrayList(); - int numLines = 0; - - public String[] GetLines(){ - return lines.toArray(new String[lines.size()]); - } - - public int GetNumLines(){ - return numLines; - } - - public void ReadFile(String filename){ - try{ - FileInputStream fstream = new FileInputStream(filename); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; - //Read File Line By Line - while ((strLine = br.readLine()) != null) { - lines.add(strLine); - numLines++; - } - in.close(); - }catch (Exception e){//Catch exception if any - System.err.println("TextFileReader Error: " + e.getMessage()); - } - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LocusMismatchWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LocusMismatchWalker.java deleted file mode 100755 index 96d3d175a..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LocusMismatchWalker.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.util.Collection; -import java.io.PrintStream; - -/** - * Walker to calculate the number of mismatches, their base counts, and their quality sums at confidence ref sites" - */ -@By(DataSource.REFERENCE) -public class LocusMismatchWalker extends LocusWalker 
implements TreeReducible { - @Output - PrintStream out; - - //@Argument(fullName="confidentRefThreshold",doc="Set the lod score that defines confidence in ref, defaults to 4", required=false) - //int confidentRefThreshold = 5; - @Argument(fullName="maxNumMismatches",doc="Set the maximum number of mismatches at a locus before choosing not to use it in calculation. Defaults to 1.", required=false) - int maxNumMismatches = 100; - @Argument(fullName="minMappingQuality", doc ="Set the alignment quality below which to ignore reads; defaults to 30", required = false) - int minMappingQuality = 1; - @Argument(fullName="minDepth",doc="Set the minimum number of reads at a locus before choosing to use it in calculation. Defaults to 20.", required=false) - int minDepth = 10; - @Argument(fullName="maxDepth",doc="Set the minimum number of reads at a locus before choosing to use it in calculation. Defaults to 20.", required=false) - int maxDepth = 100; - @Argument(fullName="minBaseQuality", doc = "Set the base quality score below which to ignore bases in the pileup, defaults to 20", required = false) - int minQualityScore = 1; - @Argument(fullName="maxBaseQuality", doc = "Set the base quality score below which to ignore bases in the pileup, defaults to no restriction", required = false) - int maxQualityScore = 99; - @Argument(fullName="minMismatches", doc = "Minimum number of mismatches at a locus before a site is displayed", required = false) - int minMismatches = 1; - - @Argument(fullName="skip", doc = "Only display every skip eligable sites. 
Defaults to all sites", required = false) - int skip = 1; - - private UnifiedGenotyperEngine ug; - - public void initialize() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; - ug = new UnifiedGenotyperEngine(getToolkit(), uac); - - // print the header - out.printf("loc ref genotype genotypeQ depth nMM qSumMM A C G T%n"); - } - - public String map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - String result = null; - - ReadBackedPileup pileup = context.getBasePileup(); - if ( locusIsUsable(tracker, ref, pileup, context) ) { - Genotype g = getGenotype(tracker, ref, context); - if ( g != null ) - result = errorCounts( ref, pileup, g ); - } - - return result; - } - - public Integer reduce( String map, Integer reduce ) { - if ( map != null && (reduce % skip == 0) ) - out.println(map); - - //if (reduce % skip == 0) System.out.printf("Keeping %d%n", reduce); - - return reduce + (map != null ? 1 : 0); - } - - public Integer treeReduce( Integer reduce1, Integer reduce2 ) { - return reduce1 + reduce2; - } - - public Integer reduceInit() { - return 1; - } - - private String errorCounts( ReferenceContext ref, ReadBackedPileup pileup, Genotype g ) { - int[] baseCounts = { 0, 0, 0, 0 }; - int usableDepth = 0; - int nMismatches = 0; - int qSumMismatches = 0; - - for ( PileupElement e : pileup ) { - if ( useRead(e) ) { - //System.out.printf("Using %s%n", e.getRead().getReadName()); - baseCounts[e.getBaseIndex()] += 1; - usableDepth++; - if ( ! 
BaseUtils.basesAreEqual(e.getBase(), ref.getBase()) ) { - nMismatches++; - qSumMismatches += e.getQual(); - } - } - } - - if ( nMismatches < maxNumMismatches && nMismatches >= minMismatches && usableDepth >= minDepth ) { - StringBuffer baseCountString = new StringBuffer(); - for ( byte b : BaseUtils.BASES ) { - baseCountString.append(baseCounts[BaseUtils.simpleBaseToBaseIndex(b)]); - baseCountString.append(" "); - } - return String.format("%s %c %10s %5.2f %d %d %d %s", - pileup.getLocation(), ref.getBaseAsChar(), - getGenotypeClass(g), 10 * g.getNegLog10PError(), - usableDepth, nMismatches, qSumMismatches, baseCountString.toString()); - } - - return null; - } - - private String getGenotypeClass(Genotype g) { - if ( g.isHomRef() ) return "HOM-REF"; - else if ( g.isHet() ) return "HET"; - else if ( g.isHom() ) return "HOM-NONREF"; - else throw new ReviewedStingException("Unexpected genotype in getGenotypeClass " + g); - } - - public boolean useRead( PileupElement e ) { - if ( e.getRead().getMappingQuality() <= minMappingQuality ) { - return false; - } else if ( ! BaseUtils.isRegularBase( e.getBase() ) ) { - return false; - } else if ( e.getQual() <= minQualityScore || e.getQual() > maxQualityScore ) { - return false; - } else { - return true; - } - } - - private boolean locusIsUsable( RefMetaDataTracker tracker, ReferenceContext ref, ReadBackedPileup pileup, AlignmentContext context ) { - return BaseUtils.isRegularBase(ref.getBase()) && - pileup.size() >= minDepth && pileup.size() < maxDepth && - notCoveredByVariations(tracker, ref) && - pileupContainsNoNs(pileup); -// pileupContainsNoNs(pileup) && -// baseIsConfidentRef(tracker,ref,context); - } - - private boolean notCoveredByVariations( RefMetaDataTracker tracker, ReferenceContext ref ) { - Collection vcs = tracker.getAllVariantContexts(ref); - // TODO: check this logic. 
I think it's the best approximation of what was here before, but it's a different system - if (vcs != null && vcs.size() > 0 ) { - return false; - } - - return true; - } - - private boolean pileupContainsNoNs(ReadBackedPileup pileup) { - for ( byte c : pileup.getBases() ) { - if ( c == 'N' ) { - return false; - } - } - - return true; - } - - private Genotype getGenotype( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - VariantCallContext calls = ug.calculateLikelihoodsAndGenotypes(tracker,ref,context); - if ( calls == null || calls.getNSamples() == 0 || !calls.isSNP() ) - return null; - else { - return calls.getGenotype(0); - } - } - -// private boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { -// Pair> calls = ug.map(tracker,ref,context); -// if ( calls == null || calls.first == null) -// return false; -// else { -// VariationCall var = calls.getFirst(); -// return var.isReference() && var.getNegLog10PError() > confidentRefThreshold; -// //return ( var.isReference() > 0 && !calls.second.get(0).isVariant(ref.getBase()) && calls.second.get(0).getNegLog10PError() > confidentRefThreshold ); -// } -// } - - public void onTraversalDone(Integer result) { - ; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java deleted file mode 100755 index 272e9ad26..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, 
sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - - -import java.util.ArrayList; -import java.util.List; -import java.util.Collection; -import java.io.PrintStream; - - -/** - * Given a set of reads and variant data from an external source, 
this walker downsamples the reads at variant - * positions to empirically assess the rate at which variants would be confidently and correctly called given different levels of coverage. - */ -public class SnpCallRateByCoverageWalker extends LocusWalker, String> { - @Output - PrintStream out; - - // Control what goes into the variants file and what format that file should have - @Argument(fullName="min_confidence_threshold", shortName="confidence", doc="The phred-scaled confidence threshold by which variants should be filtered", required=false) public int confidence = 50; - @Argument(fullName="min_coverage", shortName="mincov", doc="Mininum coverage to downsample to", required=false) public int min_coverage=1; - @Argument(fullName="max_coverage", shortName="maxcov", doc="Maximum coverage to downsample to", required=false) public int max_coverage=Integer.MAX_VALUE; - @Argument(fullName="downsampling_repeats", shortName="repeat", doc="Number of times to repeat downsampling at each coverage level", required=false) public int downsampling_repeats=1; - @Argument(fullName="coverage_step_size", shortName="step", doc="Coverage step size", required=false) public int step=1; - - UnifiedGenotyperEngine UG; - - public void initialize() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.STANDARD_CONFIDENCE_FOR_CALLING = uac.STANDARD_CONFIDENCE_FOR_EMITTING = confidence; - uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; - UG = new UnifiedGenotyperEngine(getToolkit(), uac); - - out.println("#locus\tid\tdownsampled_coverage\tpct_coverage\titeration\tref\teval_call\tcomp_call\tvariant_concordance\tgenotype_concordance"); - } - - public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return (BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 && - context.getBasePileup().size() != 0 && - tracker != null && - tracker.getAllVariantContexts(ref) != null - ); - } - - public List 
map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - Collection contexts = tracker.getAllVariantContexts(ref); - - for (VariantContext vc : contexts) { - if (vc.isVariant() && !vc.isFiltered()) { - //out.println(vc.toString()); - - ArrayList GenotypeCalls = new ArrayList(); - - List reads = context.getReads(); - List offsets = context.getOffsets(); - - int coverage_available = reads.size(); - List coverage_levels = new ArrayList(); - //Integer this_max_coverage = Math.min(max_coverage, coverage_available); - Integer this_max_coverage = 100; - //for (int coverage = min_coverage; coverage <= this_max_coverage; coverage++) { - for (int coverage = min_coverage; coverage <= this_max_coverage; coverage += step) { - coverage_levels.add(coverage); - } - - // Iterate over coverage levels - for (int coverage : coverage_levels) { - int usableCoverage = Math.min(coverage_available, coverage); // don't exceed max available coverage - - Genotype vcCall = vc.getGenotype(0); - Genotype call = null; - int goodIterations = 0; - - for (int r=0; r < downsampling_repeats; r++) { - List subset_indices = MathUtils.sampleIndicesWithReplacement(coverage_available, usableCoverage); - List sub_reads = MathUtils.sliceListByIndices(subset_indices, reads); - List sub_offsets = MathUtils.sliceListByIndices(subset_indices, offsets); - - AlignmentContext subContext = new AlignmentContext(context.getLocation(), new ReadBackedPileupImpl(context.getLocation(),sub_reads, sub_offsets)); - - VariantCallContext calls = UG.calculateLikelihoodsAndGenotypes(tracker, ref, subContext); - - if (calls != null && calls.getNSamples() > 0 && calls.confidentlyCalled) { - Genotype evCall = calls.getGenotype(0); - vcCall = vc.getGenotype(evCall.getSampleName()); - - if ((evCall.isHet() || evCall.isHomVar()) && (vcCall.isHet() || vcCall.isHomVar())) { - call = evCall; - goodIterations++; - } - - } - } - - out.printf("%s\t%s\t\t%d\t%f\t%d\t%c\t%s\t%s\t%d\t%d%n", - context.getLocation(), - 
vc.hasAttribute(VariantContext.ID_KEY) ? vc.getAttribute(VariantContext.ID_KEY) : "?", - coverage, - ((float) coverage)/((float) reads.size()), - goodIterations, - (char)BaseUtils.baseIndexToSimpleBase(ref.getBaseIndex()), - call == null ? "./." : call.getGenotypeString(), - vcCall.getGenotypeString(), - call == null ? 0 : call.getType() == vcCall.getType() ? 1 : 0, - call == null ? 0 : (call.isHet() || call.isHomVar()) && (vcCall.isHet() || vcCall.isHomVar()) ? 1 : 0); - } - return GenotypeCalls; - } - } - - return null; - } - - public String reduceInit() { - return ""; - } - - public void onTraversalDone(String result) {} // Don't print the reduce result - - public String reduce(List alleleFreqLines, String sum) { - /* - for (String line : alleleFreqLines) { - out.println(line); - } - - */ - return ""; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ValidationGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ValidationGenotyper.java deleted file mode 100755 index 94df8c69b..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ValidationGenotyper.java +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.TreeSet; - -/** - * This is the validation genotyper. It is not ready to be used by anybody. - * - * @author rpoplin - * @since Oct 20, 2010 - * @help.summary This is the validation genotyper. It is not ready to be used by anybody. 
- */ - -@Allows(value={DataSource.READS, DataSource.REFERENCE}) -public class ValidationGenotyper extends LocusWalker implements TreeReducible { - - @Output( doc="The output filtered VCF file", required=true) - private PrintStream printStream = null; - - public static class CountedData { - private long numTP = 0L; - private long numTN = 0L; - private long numFP = 0L; - private long numFilteringFN = 0L; - private long numCallingFN = 0L; - - /** - * Adds the values of other to this, returning this - * @param other the other object - */ - public void add(CountedData other) { - numTP += other.numTP; - numTN += other.numTN; - numFP += other.numFP; - numFilteringFN += other.numFilteringFN; - numCallingFN += other.numCallingFN; - } - } - - public enum VARIANT_STATUS { - CALLED, - FILTERED, - MISSING - } - - final private ArrayList evalNames = new ArrayList(); - final private ArrayList compNames = new ArrayList(); - final private TreeSet overlappingSamples = new TreeSet(); - private UnifiedGenotyperEngine engine; - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - public void initialize() { - - for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) { - if( d.getName().toLowerCase().startsWith("eval") ) { - evalNames.add( d.getName() ); - } else if( d.getName().toLowerCase().startsWith("comp") ) { - compNames.add( d.getName() ); - } else { - throw new UserException.BadInput("Don't know what to do with input ROD track named: " + d.getName()); - } - } - - if( evalNames.size() != 1 || compNames.size() != 1 ) { - throw new UserException.BadInput("Expecting to see exactly one eval track and exactly one comp track"); - } - - final TreeSet evalSamples = new TreeSet(); - evalSamples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), evalNames)); 
- final TreeSet compSamples = new TreeSet(); - compSamples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), compNames)); - for( final String sample : evalSamples ) { - if( compSamples.contains( sample ) ) { - overlappingSamples.add( sample ); - } - } - - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; - engine = new UnifiedGenotyperEngine(getToolkit(),uac); - - logger.info( "Overlapping samples = " + overlappingSamples ); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - - final CountedData counter = new CountedData(); - - if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers - return counter; - } - - VariantContext vcEval = tracker.getVariantContext(ref, evalNames.get(0), null, context.getLocation(), false); - VariantContext vcComp = tracker.getVariantContext(ref, compNames.get(0), null, context.getLocation(), false); - - if( vcEval != null ) { vcEval = vcEval.subContextFromGenotypes( vcEval.getGenotypes(overlappingSamples).values() ); } - if( vcComp != null ) { vcComp = vcComp.subContextFromGenotypes( vcComp.getGenotypes(overlappingSamples).values() ); } - - VARIANT_STATUS evalStatus; - VARIANT_STATUS compStatus; - - // First set the variant status variable for both the eval and comp then decide the site's T/F status - if( vcEval != null && vcEval.isSNP() && vcEval.isPolymorphic() ) { - if( !vcEval.isFiltered() ) { - evalStatus = VARIANT_STATUS.CALLED; - } else { - evalStatus = VARIANT_STATUS.FILTERED; - } - } else { - evalStatus = VARIANT_STATUS.MISSING; - } - if( vcComp != null && vcComp.isSNP() && vcComp.isPolymorphic() ) { 
- if( !vcComp.isFiltered() ) { - compStatus = VARIANT_STATUS.CALLED; - } else { - compStatus = VARIANT_STATUS.FILTERED; - } - } else { - compStatus = VARIANT_STATUS.MISSING; - } - - if( evalStatus == VARIANT_STATUS.CALLED && compStatus == VARIANT_STATUS.CALLED ) { counter.numTP = 1L; } - else if( evalStatus == VARIANT_STATUS.CALLED && compStatus == VARIANT_STATUS.FILTERED ) { - counter.numFP = 1L; - if( printStream!= null ) { - printStream.println(vcEval.getChr() + ":" + vcEval.getStart() ); // Used to create interval lists of FP variants - } - } - else if( evalStatus == VARIANT_STATUS.CALLED && compStatus == VARIANT_STATUS.MISSING ) { - VariantCallContext call = engine.calculateLikelihoodsAndGenotypes(tracker, ref, context); - if( call != null && call.confidentlyCalled && call.getType() == VariantContext.Type.NO_VARIATION ) { - counter.numFP = 1L; - if( printStream!= null ) { - printStream.println(vcEval.getChr() + ":" + vcEval.getStart() ); // Used to create interval lists of FP variants - } - } - } - else if( evalStatus == VARIANT_STATUS.FILTERED && compStatus == VARIANT_STATUS.CALLED ) { counter.numFilteringFN = 1L; } - //if( evalStatus == VARIANT_STATUS.FILTERED && compStatus == VARIANT_STATUS.FILTERED ) { counter.numTP = 1L; } - //if( evalStatus == VARIANT_STATUS.FILTERED && compStatus == VARIANT_STATUS.MISSING ) { counter.numTP = 1L; } - else if( evalStatus == VARIANT_STATUS.MISSING && compStatus == VARIANT_STATUS.CALLED ) { counter.numCallingFN = 1L; } - //if( evalStatus == VARIANT_STATUS.MISSING && compStatus == VARIANT_STATUS.FILTERED ) { counter.numTP = 1L; } - //if( evalStatus == VARIANT_STATUS.MISSING && compStatus == VARIANT_STATUS.MISSING ) { counter.numTP = 1L; } - return counter; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - public 
CountedData reduceInit() { - return new CountedData(); - } - - public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) { - reduceSum.add(mapValue); - return reduceSum; - } - - public CountedData treeReduce( final CountedData sum1, final CountedData sum2) { - sum2.add(sum1); - return sum2; - } - - public void onTraversalDone( CountedData reduceSum ) { - logger.info("TP = " + reduceSum.numTP); - logger.info("TN = " + reduceSum.numTN); - logger.info("FP = " + reduceSum.numFP); - logger.info("FN = " + (reduceSum.numFilteringFN + reduceSum.numCallingFN) ); - logger.info(" filtering FN = " + reduceSum.numFilteringFN ); - logger.info(" calling FN = " + reduceSum.numCallingFN ); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnEdge.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnEdge.java deleted file mode 100755 index 115041879..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnEdge.java +++ /dev/null @@ -1,30 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.assembly; - -import org.jgrapht.graph.DefaultDirectedGraph; - -/** - * Created by IntelliJ IDEA. 
- * User: ebanks - * Date: Mar 23, 2011 - */ -// simple edge class for connecting nodes in the graph -public class DeBruijnEdge { - - private int multiplicity; - - public DeBruijnEdge() { - multiplicity = 1; - } - - public int getMultiplicity() { - return multiplicity; - } - - public void setMultiplicity(int value) { - multiplicity = value; - } - - public boolean equals(DefaultDirectedGraph graph, DeBruijnEdge edge) { - return (graph.getEdgeSource(this) == graph.getEdgeSource(edge)) && (graph.getEdgeTarget(this) == graph.getEdgeTarget(edge)); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnVertex.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnVertex.java deleted file mode 100755 index 860695ef5..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnVertex.java +++ /dev/null @@ -1,71 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.assembly; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: ebanks - * Date: Mar 23, 2011 - */ -// simple node class for storing kmer sequences -public class DeBruijnVertex { - - // used for equals() - protected byte[] actualSequence; - - // used for printing and traversing graphs - protected byte[] printableSequence; - - public DeBruijnVertex(byte[] sequence) { - actualSequence = sequence; - printableSequence = new byte[sequence.length]; - System.arraycopy(sequence, 0, printableSequence, 0, sequence.length); - } - - public boolean equals(DeBruijnVertex v) { - return Arrays.equals(actualSequence, v.actualSequence); - } - - public String toString() { - return new String(printableSequence); - } - - public void addPrefix(byte[] prefix, boolean justPrintableSequence) { - printableSequence = addPrefix(printableSequence, prefix); - if ( !justPrintableSequence ) - actualSequence = addPrefix(actualSequence, prefix); - } - - private static byte[] addPrefix(byte[] sequence, byte[] prefix) { - byte[] newSequence = new byte[sequence.length + prefix.length]; - System.arraycopy(prefix, 0, newSequence, 0, prefix.length); - System.arraycopy(sequence, 0, newSequence, prefix.length, sequence.length); - return newSequence; - } - - public void removePrefix(int prefixLength, boolean justPrintableSequence) { - printableSequence = removePrefix(printableSequence, prefixLength); - if ( !justPrintableSequence ) - actualSequence = removePrefix(actualSequence, prefixLength); - } - - private static byte[] removePrefix(byte[] sequence, int prefixLength) { - int newLength = sequence.length - prefixLength; - byte[] newSequence = new byte[newLength]; - System.arraycopy(sequence, prefixLength, newSequence, 0, newLength); - return newSequence; - } - - public void removeSuffix(int suffixLength, boolean justPrintableSequence) { - printableSequence = removeSuffix(printableSequence, suffixLength); - if ( !justPrintableSequence ) - actualSequence = removeSuffix(actualSequence, suffixLength); - } - - private static byte[] removeSuffix(byte[] sequence, 
int suffixLength) { - int newLength = sequence.length - suffixLength; - byte[] newSequence = new byte[newLength]; - System.arraycopy(sequence, 0, newSequence, 0, newLength); - return newSequence; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/KBestPaths.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/KBestPaths.java deleted file mode 100755 index f94a30539..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/KBestPaths.java +++ /dev/null @@ -1,106 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.assembly; - -import org.jgrapht.graph.DefaultDirectedGraph; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: ebanks - * Date: Mar 23, 2011 - */ -// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. -// This is different from most graph traversals because we want to test paths from any source node to any sink node. -public class KBestPaths { - - // static access only - protected KBestPaths() { } - - // class to keep track of paths - protected static class Path { - - // the last vertex seen in the path - private DeBruijnVertex lastVertex; - - // the list of edges comprising the path - private List edges; - - // the scores for the path - private int totalScore = 0, lowestEdge = -1; - - public Path(DeBruijnVertex initialVertex) { - lastVertex = initialVertex; - edges = new ArrayList(0); - } - - public Path(Path p, DefaultDirectedGraph graph, DeBruijnEdge edge) { - lastVertex = graph.getEdgeTarget(edge); - edges = new ArrayList(p.edges); - edges.add(edge); - totalScore = p.totalScore + edge.getMultiplicity(); - lowestEdge = ( p.lowestEdge == -1 ) ? 
edge.getMultiplicity() : Math.min(p.lowestEdge, edge.getMultiplicity()); - } - - public boolean containsEdge(DefaultDirectedGraph graph, DeBruijnEdge edge) { - for ( DeBruijnEdge e : edges ) { - if ( e.equals(graph, edge)) - return true; - } - - return false; - } - - public List getEdges() { return edges; } - - public int getScore() { return totalScore; } - - public int getLowestEdge() { return lowestEdge; } - - public DeBruijnVertex getLastVertexInPath() { return lastVertex; } - } - - protected static class PathComparator implements Comparator { - public int compare(final Path path1, final Path path2) { - return path1.totalScore - path2.totalScore; - } - } - - public static List getKBestPaths(DefaultDirectedGraph graph, int k) { - PriorityQueue bestPaths = new PriorityQueue(k, new PathComparator()); - - // run a DFS for best paths - for ( DeBruijnVertex v : graph.vertexSet() ) { - if ( graph.inDegreeOf(v) == 0 ) - findBestPaths(graph, new Path(v), k, bestPaths); - } - - return new ArrayList(bestPaths); - } - - private static void findBestPaths(DefaultDirectedGraph graph, Path path, int k, PriorityQueue bestPaths) { - - // did we hit the end of a path? 
- if ( graph.outDegreeOf(path.lastVertex) == 0 ) { - if ( bestPaths.size() < k ) { - bestPaths.add(path); - } else if ( bestPaths.peek().totalScore < path.totalScore ) { - bestPaths.remove(); - bestPaths.add(path); - } - - return; - } - - // recursively run DFS - for ( DeBruijnEdge edge : graph.outgoingEdgesOf(path.lastVertex) ) { - - // make sure the edge is not already in the path - if ( path.containsEdge(graph, edge) ) - continue; - - Path newPath = new Path(path, graph, edge); - findBestPaths(graph, newPath, k, bestPaths); - } - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/LocalAssemblyEngine.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/LocalAssemblyEngine.java deleted file mode 100755 index 0640d5600..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/LocalAssemblyEngine.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.assembly; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; - -import java.io.PrintStream; -import java.util.List; - -/** - * Created by IntelliJ IDEA. 
- * User: ebanks - * Date: Mar 14, 2011 - */ -public abstract class LocalAssemblyEngine { - - public enum ASSEMBLER { - SIMPLE_DE_BRUIJN - } - - private PrintStream out; - private IndexedFastaSequenceFile referenceReader; - - protected LocalAssemblyEngine(PrintStream out, IndexedFastaSequenceFile referenceReader) { - this.out = out; - this.referenceReader = referenceReader; - } - - protected PrintStream getOutputStream() { return out; } - - protected IndexedFastaSequenceFile getReferenceReader() { return referenceReader; } - - public abstract void runLocalAssembly(List reads); - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java deleted file mode 100755 index 60019e2b1..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java +++ /dev/null @@ -1,445 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.assembly; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.CigarElement; -import net.sf.samtools.SAMRecord; -import org.jgrapht.graph.*; -import java.io.PrintStream; -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: ebanks - * Date: Mar 14, 2011 - */ -public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { - - private static final boolean DEBUG = true; - - // k-mer length - private static final int KMER_LENGTH = 9; - - // minimum base quality required in a contiguous stretch of a given read to be used in the assembly - private static final int MIN_BASE_QUAL_TO_USE = 20; - - // minimum clipped sequence length to consider using - private static final int MIN_SEQUENCE_LENGTH = 25; - - // minimum multiplicity to consider using - // private static final int MIN_MULTIPLICITY_TO_USE = 2; - - // FOR DEBUGGING - private int numReadsToUse = -1; - - // the deBruijn graph object - private DefaultDirectedGraph graph = null; - - - public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader, int numReadsToUse) { - super(out, referenceReader); - this.numReadsToUse = numReadsToUse; - } - - public void runLocalAssembly(List reads) { - - // reset the graph - graph = new DefaultDirectedGraph(DeBruijnEdge.class); - - // clip the reads to get just the base sequences we want - List sequences = clipReads(reads); - - // create the graph - createDeBruijnGraph(sequences); - - // find the 2 best paths in the graph - findBestPaths(); - - // assign reads to the graph - assignReadsToGraph(sequences); - } - - // This method takes the base sequences from the SAM records and pulls - // out runs of bases that are not soft-clipped and are all at least Q20s. - // Clipped sequences that are overly clipped are not used. 
- private List clipReads(List reads) { - List sequences = new ArrayList(reads.size()); - - int counter = 0; - - for ( SAMRecord read : reads ) { - - // for debugging - if ( numReadsToUse >= 0 && ++counter > numReadsToUse ) { - System.out.println("Stopping before read: " + read.getReadName() + " at " + read.getAlignmentStart()); - break; - } - - byte[] sequencedReadBases = read.getReadBases(); - byte[] sequencedBaseQuals = read.getBaseQualities(); - - int curIndex = 0, firstQ20Index = -1; - - for ( CigarElement ce : read.getCigar().getCigarElements() ) { - - int elementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case S: - // skip soft-clipped bases - curIndex += elementLength; - break; - case M: - case I: - for (int i = 0; i < elementLength; i++) { - if ( sequencedBaseQuals[curIndex] >= MIN_BASE_QUAL_TO_USE ) { - if ( firstQ20Index == -1 ) - firstQ20Index = curIndex; - } else if ( firstQ20Index != -1 ) { - int sequenceLength = curIndex - firstQ20Index; - if ( sequenceLength > MIN_SEQUENCE_LENGTH ) { - byte[] sequence = new byte[sequenceLength]; - System.arraycopy(sequencedReadBases, firstQ20Index, sequence, 0, sequenceLength); - sequences.add(sequence); - } - firstQ20Index = -1; - } - curIndex++; - } - case N: - // TODO -- implement me (cut the sequence) - default: - break; - } - } - - if ( firstQ20Index != -1 ) { - int sequenceLength = curIndex - firstQ20Index; - if ( sequenceLength > MIN_SEQUENCE_LENGTH ) { - byte[] sequence = new byte[sequenceLength]; - System.arraycopy(sequencedReadBases, firstQ20Index, sequence, 0, sequenceLength); - sequences.add(sequence); - } - } - } - - return sequences; - } - - private void createDeBruijnGraph(List reads) { - - // create the graph - createGraphFromSequences(reads); - - // remove nodes with incoming multiplicity of N - // if ( MIN_MULTIPLICITY_TO_USE > 0 ) - // removeNodesWithLowMultiplicity(); - - // cleanup graph by merging nodes - concatenateNodes(); - - // cleanup the node sequences so that they print 
well - cleanupNodeSequences(); - - if ( DEBUG ) - printGraph(); - } - - private void createGraphFromSequences(List reads) { - - for ( byte[] sequence : reads ) { - - final int kmersInSequence = sequence.length - KMER_LENGTH + 1; - for (int i = 0; i < kmersInSequence - 1; i++) { - // get the kmers - byte[] kmer1 = new byte[KMER_LENGTH]; - System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH); - byte[] kmer2 = new byte[KMER_LENGTH]; - System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH); - - addEdgeToGraph(kmer1, kmer2); - - // TODO -- eventually, we'll need to deal with reverse complementing the sequences - } - } - } - - private void addEdgeToGraph(byte[] kmer1, byte[] kmer2) { - - DeBruijnVertex v1 = addToGraphIfNew(kmer1); - DeBruijnVertex v2 = addToGraphIfNew(kmer2); - - Set edges = graph.outgoingEdgesOf(v1); - DeBruijnEdge targetEdge = null; - for ( DeBruijnEdge edge : edges ) { - if ( graph.getEdgeTarget(edge).equals(v2) ) { - targetEdge = edge; - break; - } - } - - if ( targetEdge == null ) - graph.addEdge(v1, v2, new DeBruijnEdge()); - else - targetEdge.setMultiplicity(targetEdge.getMultiplicity() + 1); - } - - private DeBruijnVertex addToGraphIfNew(byte[] kmer) { - - // the graph.containsVertex() method is busted, so here's a hack around it - DeBruijnVertex newV = new DeBruijnVertex(kmer); - for ( DeBruijnVertex v : graph.vertexSet() ) { - if ( v.equals(newV) ) - return v; - } - - graph.addVertex(newV); - return newV; - } - - private void concatenateNodes() { - - while ( true ) { - boolean graphWasModified = false; - - Set vertexSet = graph.vertexSet(); - // convert to array because results of the iteration on a set are undefined when the graph is modified - ArrayList vertices = new ArrayList(vertexSet); - - for (int i = 0; i < vertices.size(); i++) { - - DeBruijnVertex v1 = vertices.get(i); - - // try to merge v1 -> v2 - if ( graph.outDegreeOf(v1) == 1 ) { - DeBruijnEdge edge = graph.outgoingEdgesOf(v1).iterator().next(); - DeBruijnVertex v2 = 
graph.getEdgeTarget(edge); - - if ( graph.inDegreeOf(v2) == 1 ) { - mergeVertices(v1, v2); - graphWasModified = true; - break; - } - } - - // try to merge v2 -> v1 - if ( graph.inDegreeOf(v1) == 1 ) { - DeBruijnEdge edge = graph.incomingEdgesOf(v1).iterator().next(); - DeBruijnVertex v2 = graph.getEdgeSource(edge); - - if ( graph.outDegreeOf(v2) == 1 ) { - mergeVertices(v2, v1); - graphWasModified = true; - break; - } - } - } - - if ( !graphWasModified ) - break; - } - } - - private void mergeVertices(DeBruijnVertex V1, DeBruijnVertex V2) { - // (Vx -> V1 -> V2 -> Vy) - // should now be - // (Vx -> V12 -> Vy) - - // create V12 - int additionalSequenceFromV2 = V2.actualSequence.length - KMER_LENGTH + 1; - byte[] newKmer = new byte[V1.actualSequence.length + additionalSequenceFromV2]; - System.arraycopy(V1.actualSequence, 0, newKmer, 0, V1.actualSequence.length); - System.arraycopy(V2.actualSequence, KMER_LENGTH - 1, newKmer, V1.actualSequence.length, additionalSequenceFromV2); - DeBruijnVertex V12 = new DeBruijnVertex(newKmer); - graph.addVertex(V12); - - // copy edges coming from Vx to V12 - Set Ex = graph.incomingEdgesOf(V1); - for ( DeBruijnEdge edge : Ex ) { - DeBruijnVertex Vx = graph.getEdgeSource(edge); - DeBruijnEdge newEdge = new DeBruijnEdge(); - newEdge.setMultiplicity(edge.getMultiplicity()); - graph.addEdge(Vx, V12, newEdge); - } - - // copy edges going to Vy from V12 - Set Ey = graph.outgoingEdgesOf(V2); - for ( DeBruijnEdge edge : Ey ) { - DeBruijnVertex Vy = graph.getEdgeTarget(edge); - DeBruijnEdge newEdge = new DeBruijnEdge(); - newEdge.setMultiplicity(edge.getMultiplicity()); - graph.addEdge(V12, Vy, newEdge); - } - - // remove V1 and V2 and their associated edges - graph.removeVertex(V1); - graph.removeVertex(V2); - } - - private void cleanupNodeSequences() { - - // remove the first k-1 bases of the kmers - for ( DeBruijnVertex v : graph.vertexSet() ) { - if ( graph.inDegreeOf(v) > 0 ) - v.removePrefix(KMER_LENGTH - 1, true); - } - - // move 
common suffixes from incoming nodes to this one - - while ( true ) { - - boolean graphWasModified = false; - for ( DeBruijnVertex v : graph.vertexSet() ) { - - if ( graph.inDegreeOf(v) > 1 ) { - Set connectedVs = new HashSet(); - for ( DeBruijnEdge edge : graph.incomingEdgesOf(v) ) - connectedVs.add(graph.getEdgeSource(edge)); - - if ( propagateCommonSuffix(v, connectedVs) ) { - removeEmptyNodes(); - graphWasModified = true; - break; - } - } - } - - if ( !graphWasModified ) - break; - } - } - - private void removeEmptyNodes() { - - // remember that results of an iteration on a set are undefined when the graph is modified - while ( true ) { - - boolean graphWasModified = false; - for ( DeBruijnVertex v : graph.vertexSet() ) { - if ( v.printableSequence.length == 0 ) { - removeNode(v); - graphWasModified = true; - break; - } - } - - if ( !graphWasModified ) - break; - } - } - - private void removeNode(DeBruijnVertex v) { - Set incoming = graph.incomingEdgesOf(v); - Set outgoing = graph.outgoingEdgesOf(v); - - // make edges from all incoming nodes to all outgoing nodes - for ( DeBruijnEdge Ex : incoming ) { - DeBruijnVertex Vx = graph.getEdgeSource(Ex); - for ( DeBruijnEdge Ey : outgoing ) { - DeBruijnVertex Vy = graph.getEdgeTarget(Ey); - - DeBruijnEdge newEdge = new DeBruijnEdge(); - newEdge.setMultiplicity(Ex.getMultiplicity()); - graph.addEdge(Vx, Vy, newEdge); - } - } - - // remove v and its associated edges - graph.removeVertex(v); - } - - private boolean propagateCommonSuffix(DeBruijnVertex Vx, Set incoming) { - - // find the common matching suffix - byte[] match = null; - for ( DeBruijnVertex v : incoming ) { - if ( match == null ) { - match = v.printableSequence; - } else { - int idx = 0; - while ( idx < match.length && idx < v.printableSequence.length && match[match.length - idx - 1] == v.printableSequence[v.printableSequence.length - idx - 1] ) - idx++; - - if ( idx < match.length ) { - match = new byte[idx]; - System.arraycopy(v.printableSequence, 
v.printableSequence.length - idx, match, 0, idx); - } - } - } - - // if there is a common suffix... - if ( match != null && match.length > 0 ) { - - // remove the suffix from the end of the incoming nodes... - for ( DeBruijnVertex v : incoming ) - v.removeSuffix(match.length, false); - - // ...and put it at the front of this node - Vx.addPrefix(match, false); - return true; - } - - return false; - } - - private void printGraph() { - - for ( DeBruijnVertex source : graph.vertexSet() ) { - if ( graph.inDegreeOf(source) == 0 ) - getOutputStream().print("* "); - getOutputStream().print(source + " -> "); - for ( DeBruijnEdge edge : graph.outgoingEdgesOf(source) ) { - getOutputStream().print(graph.getEdgeTarget(edge) + " (" + edge.getMultiplicity() + "), "); - } - getOutputStream().println(); - } - getOutputStream().println("------------\n"); - } - - private void findBestPaths() { - - // find them - List bestPaths = KBestPaths.getKBestPaths(graph, 2); - - // print them out - for ( KBestPaths.Path path : bestPaths ) { - - List edges = path.getEdges(); - for (int i = 0; i < edges.size(); i++) { - - DeBruijnEdge edge = edges.get(i); - - if ( i == 0 ) - getOutputStream().print(graph.getEdgeSource(edge)); - - getOutputStream().print(graph.getEdgeTarget(edge)); - } - - if ( edges.size() == 0 ) - getOutputStream().print(path.getLastVertexInPath()); - - getOutputStream().println(" (score=" + path.getScore() + ", lowestEdge=" + path.getLowestEdge() +")"); - } - } - - private void assignReadsToGraph(List reads) { - - // TODO -- implement me - - } - - /**** - private void removeNodesWithLowMultiplicity() { - - Set vertexSet = graph.vertexSet(); - // convert to array because results of the iteration on a set are undefined when the graph is modified - ArrayList vertices = new ArrayList(vertexSet); - - for (int i = 0; i < vertices.size(); i++) { - - DeBruijnVertex v = vertices.get(i); - if ( graph.inDegreeOf(v) == 1 && - graph.incomingEdgesOf(v).iterator().next().getMultiplicity() < 
MIN_MULTIPLICITY_TO_USE ) - removeNode(v); - } - } - ****/ -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java deleted file mode 100755 index 080e19bea..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.assembly; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; - -import java.io.*; -import java.util.*; - -/** - * Performs local assembly. Not to be used yet. Example: java -jar dist/GenomeAnalysisTK.jar -I /seq/picard_aggregation/EXT1/NA12878/v3/NA12878.bam -R /humgen/1kg/reference/human_g1k_v37.fasta -T WindowedAssembly -et NO_ET -o foo.out -B:variant,vcf AssemblyTestAlleles.vcf -BTI variant - */ -public class WindowedAssemblyWalker extends ReadWalker { - - protected static final int MIN_MAPPING_QUALITY = 20; - - - @Output(doc="Base-space graph output", required=true) - protected PrintStream graphWriter = null; - - @Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false) - protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN; - - @Hidden - @Argument(fullName = "readsToUse", shortName = "readsToUse", doc = "For debugging: how many reads to use", required = false) - protected int numReadsToUse = -1; - - // the assembly engine - LocalAssemblyEngine assemblyEngine = null; - - // the intervals input by the user - private Iterator intervals = null; - - // the current interval in the list - private GenomeLoc currentInterval = null; - - // the reads that fall into the current interval - private final 
ArrayList readsToAssemble = new ArrayList(); - - - public void initialize() { - - IndexedFastaSequenceFile referenceReader; - try { - // fasta reference reader to supplement the edges of the reference sequence - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); - } - - assemblyEngine = makeAssembler(ASSEMBLER_TO_USE, referenceReader); - - GenomeLocSortedSet intervalsToAssemble = getToolkit().getIntervals(); - if ( intervalsToAssemble == null || intervalsToAssemble.isEmpty() ) - throw new UserException.BadInput("Intervals must be provided with -L or -BTI (preferably not larger than several hundred bp)"); - - intervals = intervalsToAssemble.clone().iterator(); - currentInterval = intervals.hasNext() ? intervals.next() : null; - } - - private LocalAssemblyEngine makeAssembler(LocalAssemblyEngine.ASSEMBLER type, IndexedFastaSequenceFile referenceReader) { - switch ( type ) { - case SIMPLE_DE_BRUIJN: - return new SimpleDeBruijnAssembler(graphWriter, referenceReader, numReadsToUse); - default: - throw new UserException.BadInput("Assembler type " + type + " is not valid/supported"); - } - } - - public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - return currentInterval == null || doNotTryToAssemble(read) ? 
null : read; - } - - private boolean doNotTryToAssemble(SAMRecord read) { - return read.getNotPrimaryAlignmentFlag() || - read.getReadFailsVendorQualityCheckFlag() || - read.getDuplicateReadFlag() || - read.getMappingQuality() < MIN_MAPPING_QUALITY; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(SAMRecord read, Integer sum) { - if ( read == null ) - return sum; - - GenomeLoc readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(read); - // hack to get around unmapped reads having screwy locations - if ( readLoc.getStop() == 0 ) - readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); - - if ( readLoc.overlapsP(currentInterval) ) { - readsToAssemble.add(read); - } else { - assemblyEngine.runLocalAssembly(readsToAssemble); - readsToAssemble.clear(); - sum++; - - do { - currentInterval = intervals.hasNext() ? intervals.next() : null; - } while ( currentInterval != null && currentInterval.isBefore(readLoc) ); - } - - return sum; - } - - public void onTraversalDone(Integer result) { - if ( readsToAssemble.size() > 0 ) { - assemblyEngine.runLocalAssembly(readsToAssemble); - result++; - } - logger.info("Ran local assembly on " + result + " intervals"); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java deleted file mode 100755 index 5bcfdd80f..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * 
restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.contamination; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.playground.utils.NamedTable; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMReadGroupRecord; - -import cern.jet.stat.Probability; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.io.PrintStream; - -/** - * FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear - * to be contaminants (meaning a read group that's not actually associated with 
the sample) by searching - * for evidence of systematic underperformance at likely homozygous-variant sites. - * - * @author Kiran Garimella - */ -public class FindContaminatingReadGroupsWalker extends LocusWalker { - @Output - private PrintStream out; - - @Argument(fullName="balance", shortName="bal", doc="The expected alternate allele balance for homozygous-variant sites", required=false) - private Double BALANCE = 0.95; - - @Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false) - private Double LIMIT = 1e-9; - - @Argument(fullName="scaleForSample", shortName="scale", doc="the scale by which the pvalue limit should reduce for testing samples directly. "+ - "E.g. if a sample has three 1e-3 read groups, pvalue is 1e-9 -- significant; so the scale should reduce by some multiplicative factor"+ - "For each read group associated with the sample. Defaults to 1e-4 [1e-9 for 1 RG, 1e-13 for 2 RG, 1e-17 for 3, etc]", required=false) - private Double SCALE = 1e-4; - - private UnifiedGenotyperEngine ug; - private NamedTable altTable; - private final double EPSILON = 1e-20; - - public void initialize() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.STANDARD_CONFIDENCE_FOR_CALLING = uac.STANDARD_CONFIDENCE_FOR_EMITTING = 50.0; - ug = new UnifiedGenotyperEngine(getToolkit(), uac); - - altTable = new NamedTable(); - } - - /** - * Identify likely homozygous-variant sites that are called as - * heterozygous, so that we can isolate our inspection to these sites. 
- * - * @param tracker the meta-data tracker - * @param ref information regarding the reference - * @param context information regarding the reads - * @return true if this site is a suspicious het, false if otherwise - */ - public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - int altCount = 0; - int totalCount = 0; - - ReadBackedPileup pileup = context.getBasePileup(); - int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase()); - - for (byte base : pileup.getBases() ) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex((char) base); - - if (baseIndex != refIndex) { - altCount++; - } - totalCount++; - } - - double altBalance = ((double) altCount)/((double) totalCount); - - if (altBalance > 0.70) { - VariantCallContext ugResult = ug.calculateLikelihoodsAndGenotypes(tracker, ref, context); - - if (ugResult != null && ugResult.getNSamples() > 0) { - return ugResult.getGenotype(0).isHet(); - } - } - - return false; - } - - /** - * For each read group represented in the pileup, determine the fraction of bases supporting the alternate allele - * - * @param tracker the meta-data tracker - * @param ref information regarding the reference - * @param context information regarding the reads - * @return 1 - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - NamedTable alleleCounts = new NamedTable(); - - int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase()); - String colName = String.format("%s.%d", context.getContig(), context.getPosition()); - - for (int i = 0; i < context.size(); i++) { - SAMRecord read = context.getReads().get(i); - int offset = context.getOffsets().get(i); - - SAMReadGroupRecord rg = read.getReadGroup(); - int alleleIndex = BaseUtils.simpleBaseToBaseIndex((char) read.getReadBases()[offset]); - - alleleCounts.increment(rg.getReadGroupId(), (alleleIndex == refIndex) ? 
"ref" : "alt"); - } - - for (String rg : alleleCounts.getRowNames()) { - double altCount = alleleCounts.get(rg, "alt"); - double refCount = alleleCounts.get(rg, "ref"); - - altTable.set(rg, colName, altCount / (altCount + refCount)); - } - - return 1; - } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - return null; - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(Integer value, Integer sum) { - return null; - } - - /** - * Perform the t-test and list the read groups that are significant underperformers. - * - * @param result the number of suspicious sites we're inspecting (this argument is ignored) - */ - public void onTraversalDone(Integer result) { - //out.println("readgroup\tpvalue\tstatus\tbalances"); - out.printf("%-10s\t%-13s\t%-10s\t%-10s%n", "readgroup", "pvalue", "status", "balances"); - - HashMap pvalByReadGroup = new HashMap(); - for (String rg : altTable.getRowNames()) { - String balances = ""; - - // Compute mean - double sum = 0.0, total = 0.0; - - for (String locus : altTable.getColumnNames()) { - double value = altTable.get(rg, locus); - - sum += value; - total += 1.0; - - balances += String.format("%2.2f,", value); - } - - double mean = sum/total; - - // Compute stdev - double squareSumOfMeanDifferences = 0.0; - - for (String locus : altTable.getColumnNames()) { - double value = altTable.get(rg, locus); - - squareSumOfMeanDifferences += Math.pow(value - mean, 2.0); - } - - double stdev = Math.sqrt(squareSumOfMeanDifferences/total); - - // Compute standard error of the mean (SEM) - double sem = stdev/Math.sqrt(total); - - // Compute test statistic t - double t = (mean - BALANCE) / sem; - - // Degrees of freedom - double dof = total - 1.0; - - // 
Compute pValue - double pValue = Probability.studentT(dof, t); - pValue = pValue < EPSILON ? EPSILON : pValue; - pvalByReadGroup.put(rg,pValue); - - //out.printf("%s\t%e\t%s\t[%s]\n", rg, pValue, (pValue < LIMIT ? "aberrant" : "nominal"), balances); - out.printf("%-10s\t%-13s\t%-10s\t[%-10s]\n", - rg, - String.format("%e", pValue), - (pValue < LIMIT ? "aberrant" : "nominal"), - balances); - - logger.debug(rg); - } - - out.printf("%n%n%s%n","SECTION ON BADLY CONTAMINATED SAMPLES"); - out.printf("%s\t%s\t%s\t%s%n","sample","p-value","status","info"); - - HashMap> samplesToReadGroups = new HashMap>(); - for ( SAMReadGroupRecord rec : getToolkit().getSAMFileHeader().getReadGroups() ) { - if ( samplesToReadGroups.containsKey(rec.getSample()) ) { - samplesToReadGroups.get(rec.getSample()).add(rec.getReadGroupId()); - } else { - ArrayList newList = new ArrayList(); - newList.add(rec.getReadGroupId()); - samplesToReadGroups.put(rec.getSample(),newList); - } - } - - for ( String sample : samplesToReadGroups.keySet() ) { - double p_value = 1; - double limit = LIMIT; - boolean containsAberrantReads = false; - for ( String rg : samplesToReadGroups.get(sample) ) { - double rg_pval = ( pvalByReadGroup.get(rg) == null ? 1 : pvalByReadGroup.get(rg) ); - p_value = p_value*rg_pval; - containsAberrantReads = containsAberrantReads || rg_pval < LIMIT; - limit = limit*SCALE; - logger.debug(rg); - } - - out.printf("%s\t%-13s\t%s\t%s%n", sample, String.format("%e",p_value), ( p_value < limit ? "aberrant" : "nominal"), ( containsAberrantReads ? 
"contains_aberrant_RG" : "no_aberrant_RG")); - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/ComputeConfusionMatrix.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/ComputeConfusionMatrix.java deleted file mode 100755 index 11a4ab70e..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/ComputeConfusionMatrix.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.diagnostics; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.BaseUtils; -import net.sf.samtools.SAMRecord; - -import java.util.HashMap; -import java.util.Arrays; -import java.util.Hashtable; -import java.io.PrintStream; - -/** - * Computes empirical base confusion matrix, and optionally computes - * these matrices with up to five bases of preceding context - */ -@Reference(window=@Window(start=-5,stop=5)) -public class ComputeConfusionMatrix extends LocusWalker { - @Output - protected PrintStream out; - - @Argument(fullName="minimumDepth", shortName="minDepth", doc="Require locus pileup to have specified minimum depth (default: 10)", required=false) - public Integer MIN_DEPTH = 10; - - @Argument(fullName="maximumDepth", shortName="maxDepth", doc="Require locus pileup to have specified maximum depth (default: 100)", required=false) - public Integer MAX_DEPTH = 100; - - @Argument(fullName="contextWindowSize", shortName="window", doc="Size of context window (default: 0)", required=false) - public Integer WINDOW_SIZE = 0; - - private Hashtable confusionCounts = new Hashtable(); - - public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - int pileupSize = context.size(); - - int numAlts = 0; - int[] baseCounts = context.getBasePileup().getBaseCounts(); - for (int baseIndex = 0; baseIndex < baseCounts.length; baseIndex++) { - if (baseIndex != ref.getBaseIndex()) { - numAlts += baseCounts[baseIndex]; - } - 
} - - return ( - pileupSize >= MIN_DEPTH && // don't process regions without a reasonable pileup - pileupSize < MAX_DEPTH && // don't process suspiciously overcovered regions - ref.getBases().length % 2 == 1 && // don't process regions that don't have a full context window - numAlts == 1 && // don't process regions that have more than one mismatching base - ref.getBaseIndex() >= 0 // don't process a locus with an ambiguous reference base - ); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - int windowLength = ref.getBases().length; - int windowCenter = (windowLength - 1)/2; - - String fwRefBases = new String(ref.getBases()); - String fwRefBase = String.format("%c", ref.getBaseAsChar()); - String fwWindowLeft = fwRefBases.substring(windowCenter - WINDOW_SIZE, windowCenter); - - //String rcRefBases = new String(BaseUtils.simpleReverseComplement(ref.getBases())); - //String rcRefBase = String.format("%c", BaseUtils.simpleComplement(ref.getBase())); - //String rcWindowRight = rcRefBases.substring(windowCenter + 1, windowCenter + 1 + WINDOW_SIZE); - - int[] baseCounts = context.getBasePileup().getBaseCounts(); - int altBaseIndex = -1; - for (int baseIndex = 0; baseIndex < 4; baseIndex++) { - if (baseCounts[baseIndex] == 1) { - altBaseIndex = baseIndex; - } - } - - String fwAltBase = String.format("%c", (char)BaseUtils.baseIndexToSimpleBase(altBaseIndex)); - //String rcAltBase = BaseUtils.simpleComplement(fwAltBase); - - for (int readIndex = 0; readIndex < context.getReads().size(); readIndex++) { - SAMRecord read = context.getReads().get(readIndex); - int offset = context.getOffsets().get(readIndex); - - char base = read.getReadString().charAt(offset); - int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - - if (baseIndex == altBaseIndex) { - if (read.getReadNegativeStrandFlag()) { - //incrementConfusionCounts(rcWindowRight, rcRefBase, rcAltBase); - } else { - incrementConfusionCounts(fwWindowLeft, fwAltBase, 
fwRefBase); - } - } - } - - return null; - } - - private void incrementConfusionCounts(String context, String altBase, String refBase) { - String key = String.format("%s:%s:%s", context, altBase.toUpperCase(), refBase.toUpperCase()); - - Integer counts = confusionCounts.get(key); - if (counts == null) { counts = 0; } - - confusionCounts.put(key, counts + 1); - } - - public Integer reduceInit() { - return null; - } - - public Integer reduce(Integer value, Integer sum) { - return null; - } - - public void onTraversalDone(Integer result) { - String[] keys = confusionCounts.keySet().toArray(new String[0]); - Arrays.sort(keys); - - HashMap contextualNorms = new HashMap(); - for (String key : keys) { - String[] fields = key.split(":"); - - String contextualKey = String.format("%s:%s", fields[0], fields[1]); - Integer contextualCount = contextualNorms.get(contextualKey); - if (contextualCount == null) { contextualCount = 0; } - contextualNorms.put(contextualKey, contextualCount + confusionCounts.get(key)); - } - - out.printf("context\talt\tref\tcounts\ttotal\tfraction\n"); - for (String key : keys) { - String[] fields = key.split(":"); - String contextualKey = String.format("%s:%s", fields[0], fields[1]); - - out.printf( - "%s\t%s\t%s\t%d\t%d\t%f\n", - fields[0].isEmpty() ? "." 
: fields[0], - fields[1], - fields[2], - confusionCounts.get(key), - contextualNorms.get(contextualKey), - confusionCounts.get(key)/((float) contextualNorms.get(contextualKey)) - ); - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/ErrorRatePerCycle.java deleted file mode 100755 index af69c4cef..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ /dev/null @@ -1,88 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.diagnostics; - -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.utils.BaseUtils; - -import java.io.PrintStream; -import java.util.List; - -/** - * Computes the read error rate per position in read (in the original 5'->3' orientation that the read had coming off the machine) - */ -public class ErrorRatePerCycle extends LocusWalker { - @Output PrintStream out; - @Argument(fullName="min_base_quality_score", shortName="mbq", doc="Minimum base quality required to consider a base for calling (default: 0)", required=false) public Integer MIN_BASE_QUAL = 0; - @Argument(fullName="min_mapping_quality_score", shortName="mmq", doc="Minimum read mapping quality required to consider a read for calling (default: 0)", required=false) public Integer MIN_MAPPING_QUAL = 0; - - private GATKReport report; - private String reportName = "ErrorRatePerCycle"; - private String reportDescription 
= "The error rate per sequenced position in the reads"; - - public void initialize() { - report = new GATKReport(); - - report.addTable(reportName, reportDescription); - report.getTable(reportName).addPrimaryKey("cycle"); - - for (SAMReadGroupRecord rg : this.getToolkit().getSAMFileHeader().getReadGroups()) { - String readGroupId = rg.getReadGroupId(); - - report.getTable(reportName).addColumn("mismatches." + readGroupId, 0, false); - report.getTable(reportName).addColumn( "qualsum." + readGroupId, 0, false); - report.getTable(reportName).addColumn( "counts." + readGroupId, 0, false); - report.getTable(reportName).addColumn( "errorrate." + readGroupId, 0.0f); - report.getTable(reportName).addColumn( "qualavg." + readGroupId, 0.0f); - } - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - List offsets = context.getOffsets(); - List reads = context.getReads(); - - for (int i = 0; i < offsets.size(); i++) { - int offset = offsets.get(i); - - if (reads.get(i).getMappingQuality() >= MIN_MAPPING_QUAL && reads.get(i).getBaseQualities()[offset] >= MIN_BASE_QUAL) { - char readBase = reads.get(i).getReadString().charAt(offset); - - int refIndex = ref.getBaseIndex(); - int readIndex = BaseUtils.simpleBaseToBaseIndex(readBase); - - if (!reads.get(i).getReadNegativeStrandFlag() && (!reads.get(i).getReadPairedFlag() || reads.get(i).getFirstOfPairFlag())) { - String readGroupId = reads.get(i).getReadGroup().getReadGroupId(); - - if (refIndex != readIndex) { - report.getTable(reportName).increment(offset, "mismatches." + readGroupId); - } - report.getTable(reportName).add(offset, "qualsum." + readGroupId, (int) reads.get(i).getBaseQualities()[offset]); - report.getTable(reportName).increment(offset, "counts." 
+ readGroupId); - } - } - } - - return null; - } - - public Integer reduceInit() { return null; } - - public Integer reduce(Integer value, Integer sum) { return null; } - - public void onTraversalDone(Integer sum) { - for (SAMReadGroupRecord rg : this.getToolkit().getSAMFileHeader().getReadGroups()) { - String readGroupId = rg.getReadGroupId(); - - report.getTable(reportName).divideColumns("errorrate." + readGroupId, "mismatches." + readGroupId, "counts." + readGroupId); - report.getTable(reportName).divideColumns( "qualavg." + readGroupId, "qualsum." + readGroupId, "counts." + readGroupId); - } - - report.print(out); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/MatePairLibrarySize.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/MatePairLibrarySize.java deleted file mode 100755 index 2df6b8b06..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/MatePairLibrarySize.java +++ /dev/null @@ -1,85 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.diagnostics; - -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.commandline.Argument; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.*; -import java.io.File; -import java.io.PrintWriter; -import java.io.IOException; - -/** - * For each sequencing library, outputs the distribution of mate pair sizes - */ -public class MatePairLibrarySize extends ReadWalker { - @Argument(fullName="outdir", shortName="outdir", doc="Directory to output results") - private File OUT_DIR; - - private HashMap> matePairSize; - - public void initialize() { - matePairSize = new HashMap>(); - - for (SAMReadGroupRecord rg : 
this.getToolkit().getSAMFileHeader().getReadGroups()) { - HashMap mps = new HashMap(); - - matePairSize.put(rg.getLibrary(), mps); - } - } - - public boolean filter(ReferenceContext ref, SAMRecord read) { - return (read.getReadPairedFlag() && read.getFirstOfPairFlag()); - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - int insert = read.getInferredInsertSize(); - - Integer oldcount = matePairSize.get(read.getReadGroup().getLibrary()).get(insert); - if (oldcount == null) { oldcount = 0; } - - matePairSize.get(read.getReadGroup().getLibrary()).put(insert, oldcount + 1); - - return null; - } - - public Integer reduceInit() { - return null; - } - - public Integer reduce(Integer value, Integer sum) { - return null; - } - - public void onTraversalDone(Integer sum) { - String[] libraries = matePairSize.keySet().toArray(new String[1]); - - for (String library : libraries) { - File file = new File(String.format("%s/%s.pairdist", OUT_DIR.getAbsolutePath(), library)); - try { - Integer[] sizes = matePairSize.get(library).keySet().toArray(new Integer[1]); - - if (sizes != null && sizes.length > 1) { - PrintWriter pw = new PrintWriter(file); - Arrays.sort(sizes); - - pw.printf("%s\t%s%n", "insert", "frequency"); - - for (int insert : sizes) { - if (insert >= 0) { - pw.printf("%d\t%d%n", insert, matePairSize.get(library).get(insert)); - } - } - - pw.close(); - } - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(file, e); - } - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/QualityScoreDistribution.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/QualityScoreDistribution.java deleted file mode 100755 index 8be94d143..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/QualityScoreDistribution.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * 
Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.diagnostics; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import net.sf.samtools.SAMRecord; - -import java.util.*; -import java.io.PrintStream; - -/** - * Compute quality score distribution - */ -public class QualityScoreDistribution extends LocusWalker { - @Output - PrintStream out; - - private HashMap qualDists; - - public void initialize() { - qualDists = new HashMap(); - - qualDists.put("all", new long[QualityUtils.MAX_QUAL_SCORE]); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - List reads = context.getReads(); - List offsets = context.getOffsets(); - - for (int i = 0; i < reads.size(); i++) { - byte qual = reads.get(i).getBaseQualities()[offsets.get(i)]; - String name = reads.get(i).getReadGroup().getReadGroupId(); - - if (!qualDists.containsKey(name)) { - qualDists.put(name, new long[QualityUtils.MAX_QUAL_SCORE]); - } - - qualDists.get(name)[qual]++; - qualDists.get("all")[qual]++; - } - - return null; - } - - public Integer reduceInit() { - return null; - } - - public Integer reduce(Integer value, Integer sum) { - return null; - } - - public void onTraversalDone(Integer result) { - Set names = qualDists.keySet(); - HashMap norms = new HashMap(); - - for (String name : names) { - long norm = 0; - for (int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE; qual++) { - norm += qualDists.get(name)[qual]; - } - - 
norms.put(name, norm); - } - - out.printf("Q"); - for (String name : names) { - out.printf("\t%s", name); - } - out.println(); - - for (int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE; qual++) { - out.printf("%d", qual); - - for (String name : names) { - out.printf("\t%f", ((float) qualDists.get(name)[qual])/((float) norms.get(name))); - } - - out.println(); - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/SNPDensity.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/SNPDensity.java deleted file mode 100755 index 37745c003..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/SNPDensity.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.diagnostics; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.EnumSet; -import java.io.PrintStream; - -/** - * Computes the density of SNPs passing and failing filters in intervals on the genome and emits a table for display - */ -@By(DataSource.REFERENCE) -@Requires(value={},referenceMetaData=@RMD(name="eval",type=VariantContext.class)) -public class SNPDensity extends RefWalker, SNPDensity.Counter> { - @Output - private PrintStream out; - - @Argument(fullName="granularity", shortName="granularity", doc="", required=false) - private int granularity = 1000000; - - public void initialize() { - out.printf("chr middlePos linearPos nSNPs nSNPsFiltered unfiltered.density filtered.density%n"); - } - - public class Counter { - GenomeLoc firstLoc = null; - long linearOffset = 0; - int nSNPsCalled = 0; - int nSNPsFiltered = 0; - - public Counter(Long linearOffset) { - this.linearOffset = linearOffset; - - //System.out.printf("linear offset %d%n", linearOffset); - } - } - - public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - VariantContext vc = tracker.getVariantContext(ref, "eval", EnumSet.of(VariantContext.Type.SNP), context.getLocation(), false); - return new Pair(vc, context.getLocation()); - } - - public Counter reduceInit() { - return new Counter(0L); - } - - private void printLine(Counter sum) { - long offset = granularity / 2 - 1; - long chrOffset = 
sum.firstLoc.getStart() + offset; - out.printf("%s %d %d %d %d %.2e %.2e%n", - sum.firstLoc.getContig(), - chrOffset, - sum.linearOffset + offset, - sum.nSNPsCalled, sum.nSNPsFiltered, - (1.0 * sum.nSNPsCalled) / granularity, (1.0 * sum.nSNPsFiltered) / granularity); - } - - public Counter reduce(Pair p, Counter sum) { - if ( p == null ) - return sum; - -// System.out.printf("%s %s %d%n", c.getLocation(), sum.firstLoc, sum.nSNPsSeen); - VariantContext c = p.getFirst(); - GenomeLoc loc = p.getSecond(); - - if ( sum.firstLoc != null ) { - long dist = loc.distance(sum.firstLoc); -// System.out.printf(" dist = %d%n", dist); - if ( dist > granularity ) { - printLine(sum); - sum = new Counter(sum.linearOffset + granularity); - } - } - - if ( sum.firstLoc == null ) sum.firstLoc = loc; - - sum.nSNPsCalled += c != null && c.isNotFiltered() ? 1 : 0; - sum.nSNPsFiltered += c != null && c.isFiltered() ? 1 : 0; - - return sum; - } - - public void onTraversalDone(Counter sum) { - printLine(sum); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java deleted file mode 100644 index 6cbe725c7..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this 
permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.duplicates; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.duplicates.DupUtils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.List; -import java.util.Set; -import java.util.ArrayList; -import java.io.PrintStream; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileWriter; - -/** - * Process the input bam file, optionally emitting all the unique reads found, and emitting the combined duplicate reads to - * the specified output BAM location. If no output location is specified, the reads are written to STDOUT. 
- */ -public class CombineDuplicatesWalker extends DuplicateWalker, SAMFileWriter> { - @Output - public PrintStream out; - - @Argument(fullName="outputBAM", shortName="outputBAM", required=false, doc="BAM File to write combined duplicates to") - public SAMFileWriter outputBAM = null; - - @Argument(fullName="maxQ", shortName="maxQ", required=false, - doc="The maximum Q score allowed for combined reads, reflects the background error rate giving rise to perfect bases that don't correspond to the reference") - public int MAX_QUALITY_SCORE = 50; - - /** - * start the walker with the command line argument specified SAMFileWriter - * @return a sam file writer, which may be null - */ - public SAMFileWriter reduceInit() { - return outputBAM; - } - - /** - * emit the read that was produced by combining the dupplicates - */ - public SAMFileWriter reduce(List reads, SAMFileWriter output) { - for ( SAMRecord read : reads ) { - if ( output != null ) { - output.addAlignment(read); - } else { - out.println(read.format()); - } - } - - return output; - } - - - /** - * when we're done, print out the collected stats - * @param result the result of the traversal engine, to be printed out - */ - public void onTraversalDone(SAMFileWriter result) { - return; // don't do anything - } - - - /** - * Build a combined read given the input list of non-unique reads. If there's just one read in the - * set, it's considered unique and returned. If there's more than one, the N-way combine - * duplicate function is invoked. - * - * @param loc the genome loc - * @param context the alignment context that has the reads information - * @param readSets the set of unique reads list at this locus - * @return a read that combines the dupplicate reads at this locus - */ - public List map(GenomeLoc loc, AlignmentContext context, Set> readSets ) { - List combinedReads = new ArrayList(); - - for ( List reads : readSets ) { - SAMRecord combinedRead = null; - - if ( reads.size() == 1 && ! 
reads.get(0).getDuplicateReadFlag() ) { - // we are a unique read - combinedRead = reads.get(0); - } else { - // actually call the combine function -// for (SAMRecord read : reads ) { -// out.printf("Combining Read %s%n", read.format()); -// } -// - combinedRead = DupUtils.combineDuplicates(getToolkit().getGenomeLocParser(),reads, MAX_QUALITY_SCORE); - //out.printf(" => into %s%n", combinedRead.format()); - } - - if ( combinedRead.getDuplicateReadFlag() ) - throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s", - combinedRead.getReadName(), reads.size(), combinedRead.format())); - - combinedReads.add(combinedRead); - } - - return combinedReads; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java deleted file mode 100644 index 522fdad13..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.duplicates; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.List; -import java.util.Set; -import java.util.ArrayList; -import java.io.PrintStream; - -/** - * a class to store the traversal information we pass around - */ -class DuplicateCount { - public int count = 0; // the count of sites we were given - public int nUniqueMolecules = 0; // the unique read count - public int nDuplicatedMolecules = 0; // the unique read count - public int depth = 0; // the dupplicate read depth -} - -/** - * Count the number of unique reads, duplicates, and the average depth of unique reads and duplicates at all positions. 
- * @author mark DePristo - */ -public class CountDuplicatesWalker extends DuplicateWalker { - @Output - PrintStream out; - - @Argument(fullName="quietLocus", required=false, doc="If true, per locus information isn't printed") - public boolean quiet = false; - - /** - * the map function, conforming to the duplicates interface - * @param loc the genomic location - * @param context the AlignmentContext, containing all the reads overlapping this region - * @param readSets all the duplicate reads - * @return a DuplicateCount object, with the appropriate stats - */ - public DuplicateCount map(GenomeLoc loc, AlignmentContext context, Set> readSets ) { - if ( ! quiet ) out.printf("%s with %d read sets => ", loc, readSets.size()); - - DuplicateCount dup = new DuplicateCount(); - dup.depth = 0; - for ( List reads : readSets) { - List names = new ArrayList(); - for ( SAMRecord read : reads ) { - names.add(read.getReadName()); - } - if ( ! quiet ) out.printf("%d reads [%s] ", reads.size(), Utils.join(",", names)); - dup.depth += reads.size(); - dup.nDuplicatedMolecules += reads.size() > 1 ? 1 : 0; // if there's more than 1 read per set, we're a duplicated reads - } - if ( ! quiet ) out.printf("%n"); - - dup.count = 1; - dup.nUniqueMolecules = readSets.size(); - return dup; - } - - public boolean mapAtLociWithoutDuplicates() { return true; } - - /** - * setup our walker. In this case, new a DuplicateCount object and return it - * @return the object holding the counts of the duplicates - */ - public DuplicateCount reduceInit() { - return new DuplicateCount(); - } - - /** - * the reduce step. 
This function combines the DuplicateCount objects, and updates the running depth average - * @param value the new DuplicateCount - * @param sum the running sum DuplicateCount - * @return a new DuplicateCount with the updated sums - */ - public DuplicateCount reduce(DuplicateCount value, DuplicateCount sum) { - DuplicateCount dup = new DuplicateCount(); - dup.count = sum.count + value.count; - dup.depth = value.depth + sum.depth; - dup.nDuplicatedMolecules = value.nDuplicatedMolecules + sum.nDuplicatedMolecules; - dup.nUniqueMolecules = value.nUniqueMolecules + sum.nUniqueMolecules; - return dup; - } - - /** - * when we're done, print out the collected stats - * @param result the result of the traversal engine, to be printed out - */ - public void onTraversalDone(DuplicateCount result) { - out.println("[REDUCE RESULT] Traversal result is: "); - out.println("traversal iterations = " + result.count); - out.printf("average depth = %.2f%n", (double)result.depth / (double)result.count); - out.println("unique molecules seen = " + result.nUniqueMolecules); - out.println("duplicated molecules seen = " + result.nDuplicatedMolecules); - out.printf("percent duplicated = %.2f%%%n", result.nDuplicatedMolecules / (double)result.nUniqueMolecules * 100); - out.printf("average unique read depth = %.2f%n", (double)result.nUniqueMolecules / (double)result.count); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/package-info.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/package-info.java deleted file mode 100644 index 7ced4fc8e..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/package-info.java +++ /dev/null @@ -1,4 +0,0 @@ -/** - * @help.display.name Miscellaneous walkers (experimental) - */ -package org.broadinstitute.sting.playground.gatk.walkers; \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/PrintReferenceVariantsWalker.java 
b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/PrintReferenceVariantsWalker.java deleted file mode 100755 index eeaa3ff41..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/PrintReferenceVariantsWalker.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ -package org.broadinstitute.sting.playground.gatk.walkers.phasing; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.phasing.WriteVCF; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.util.*; - -/** - * At each locus in the input data set, prints the reference in VCF format. - */ - -@By(DataSource.REFERENCE) -@Requires({DataSource.REFERENCE, DataSource.REFERENCE_BASES}) - -public class PrintReferenceVariantsWalker extends LocusWalker { - @Output(doc = "File to which reference variants should be written", required = true) - protected VCFWriter writer = null; - - private final static String REFERENCE = "REFERENCE"; - private final static double REF_NEG_LOG_10_P_ERROR = 9.9; - - private String REF_FILE_NAME = null; - - public void initialize() { - this.REF_FILE_NAME = getToolkit().getArguments().referenceFile.getName(); - - initializeVcfWriter(); - } - - private void initializeVcfWriter() { - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", REF_FILE_NAME)); - - Set samples = new TreeSet(); - samples.add(REFERENCE); - writer.writeHeader(new VCFHeader(hInfo, samples)); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (ref == null) - return 0; - - GenomeLoc refLoc = ref.getLocus(); - - Allele 
refAllele = Allele.create(ref.getBase(), true); // create a single-base allele - Set alleles = new HashSet(); - alleles.add(refAllele); - - Map genotypes = new HashMap(); - boolean isPhased = true; // trivially true for a haploid genotype - Genotype haploidRefGt = new Genotype(REFERENCE, new LinkedList(alleles), VCFConstants.MAX_GENOTYPE_QUAL, new HashSet(), new HashMap(), isPhased); - genotypes.put(REFERENCE, haploidRefGt); - - // Ensure that the genotype refers to alleles of length 1 (by using refLoc.getStart() as the stop position): - VariantContext vc = new VariantContext(REF_FILE_NAME, refLoc.getContig(), refLoc.getStart(), refLoc.getStart(), alleles, genotypes, REF_NEG_LOG_10_P_ERROR, new HashSet(), new HashMap()); - - WriteVCF.writeVCF(vc, writer, logger); - - return 1; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer addIn, Integer sum) { - return sum + addIn; - } - - public void onTraversalDone(Integer result) { - System.out.println("Processed " + result + " sites."); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBasedPhasingValidationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBasedPhasingValidationWalker.java deleted file mode 100755 index b3136b54f..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBasedPhasingValidationWalker.java +++ /dev/null @@ -1,401 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: 
- * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.phasing; - -import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.phasing.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.*; -import java.util.*; - -/** - * Walks along all variant ROD loci and verifies the phasing from the reads for user-defined pairs of sites. 
- */ -@Allows(value = {DataSource.READS, DataSource.REFERENCE}) -@Requires(value = {DataSource.READS, DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) -@By(DataSource.READS) - -@ReadFilters({ZeroMappingQualityReadFilter.class}) -// Filter out all reads with zero mapping quality - -public class ReadBasedPhasingValidationWalker extends RodWalker { - private LinkedList rodNames = null; - - @Argument(fullName = "sitePairsFile", shortName = "sitePairsFile", doc = "File of pairs of variants for which phasing in ROD should be assessed using input reads", required = true) - protected File sitePairsFile = null; - - @Output - protected PrintStream out; - - private Set sitePairs = null; - private String sampleName = null; - - SiteGenotypeAndReads prevSiteAndReads = null; - - private final static int NUM_IN_PAIR = 2; // trivial - - // enable deletions in the pileup - public boolean includeReadsWithDeletionAtLoci() { - return true; - } - - public void initialize() { - rodNames = new LinkedList(); - rodNames.add("variant"); - - sitePairs = new TreeSet(); - GenomeLocParser locParser = getToolkit().getGenomeLocParser(); - - InputStream sitePairsStream = null; - try { - sitePairsStream = new FileInputStream(sitePairsFile); - } catch (FileNotFoundException fnfe) { - fnfe.printStackTrace(); - throw new UserException("Problem opening file: " + sitePairsFile); - } - - AsciiLineReader sitePairsReader = new AsciiLineReader(sitePairsStream); - while (true) { - String line = null; - try { - line = sitePairsReader.readLine(); - } catch (IOException ioe) { - ioe.printStackTrace(); - throw new UserException("Problem reading file: " + sitePairsFile); - } - if (line == null) - break; // reached end of file - - String[] twoSites = line.split("\t"); - if (twoSites.length != 2) - throw new UserException("Must have PAIRS of sites in line " + line + " of " + sitePairsFile); - - SitePair sp = new SitePair(locParser.parseGenomeLoc(twoSites[0]), 
locParser.parseGenomeLoc(twoSites[1])); - sitePairs.add(sp); - } - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - boolean relevantSitePair = false; - SitePair sp = null; - if (prevSiteAndReads != null) { - // all vc's below start at ref.getLocus() [due to requireStartHere = true]: - sp = new SitePair(prevSiteAndReads.site, ref.getLocus()); - relevantSitePair = sitePairs.contains(sp); - } - - if (context == null || !context.hasBasePileup()) - return null; - ReadBackedPileup pileup = context.getBasePileup(); - String nextName = null; - - Collection sampNames = pileup.getSampleNames(); - if (sampNames.size() != 1) - throw new UserException("Reads must be for exactly one sample [not multi-sample]"); - nextName = sampNames.iterator().next(); - if (nextName == null) - throw new UserException("Reads must be for exactly one sample"); - - if (sampleName == null) - sampleName = nextName; - else if (!nextName.equals(sampleName)) - throw new UserException("Reads must have a single consistent sample name"); - - pileup = pileup.getPileupForSampleName(sampleName); - - ReadBasesAtPosition readBases = new ReadBasesAtPosition(); - for (PileupElement p : pileup) - readBases.putReadBase(p); - - ReadCounter rdCounts = null; - if (relevantSitePair) { // otherwise, processed the reads for their possible use in the future: - PhasingReadList buildReads = new PhasingReadList(NUM_IN_PAIR); - buildReads.updateBases(0, prevSiteAndReads.readBases); - buildReads.updateBases(1, readBases); - - List reads = new LinkedList(); - 
for (Map.Entry readEntry : buildReads.entrySet()) { - PhasingRead rd = readEntry.getValue(); - if (rd.getNonNullIndices().length == NUM_IN_PAIR) { // only want reads with BOTH bases called [possibly as deleted ("D")] - reads.add(rd); - logger.debug("Read name: " + readEntry.getKey() + "\trd: " + rd); - } - } - - // Count the occurence of each "haplotype": - rdCounts = new ReadCounter(); - for (PhasingRead rd : reads) - rdCounts.incrementCount(rd); - } - - - // Now, read the ROD and note the genotypes and their phase to be validated: - Set calledHaplotypes = null; - List allPossibleHaplotypes = null; - - boolean requireStartHere = true; // only see each VariantContext once - boolean takeFirstOnly = true; // take only the first entry from the ROD file - for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { - if (vc.isFiltered() || !vc.isSNP()) - continue; - - if (vc.getNSamples() != 1) - throw new UserException("ROD file must have exactly one sample [not multi-sample]"); - nextName = vc.getSampleNames().iterator().next(); - if (sampleName == null) - sampleName = nextName; - else if (!nextName.equals(sampleName)) - throw new UserException("ROD must have a single consistent sample name"); - - Genotype gt = vc.getGenotype(sampleName); - - if (relevantSitePair) { - Genotype prevGt = prevSiteAndReads.gt; - List prevAlleles = prevGt.getAlleles(); - List curAlleles = gt.getAlleles(); - - calledHaplotypes = new TreeSet(); // implemented Haplotype.compareTo() - if (gt.isPhased()) { - if (gt.getPloidy() != prevGt.getPloidy()) - throw new UserException("Invalid ROD file: cannot be phased AND have different ploidys!"); - - // Consider only the haplotypes called to be phased - Iterator curAllIt = curAlleles.iterator(); - for (Allele prevAll : prevAlleles) { - Allele curAll = curAllIt.next(); - calledHaplotypes.add(successiveAllelesToHaplotype(prevAll, curAll)); - } - } - - // Consider EVERY combination 
of alleles as haplotypes [IF PHASED, this will give the contingency table in the CORRECT order]: - allPossibleHaplotypes = new LinkedList(); - for (Allele prevAll : prevAlleles) { - for (Allele curAll : curAlleles) { - allPossibleHaplotypes.add(successiveAllelesToHaplotype(prevAll, curAll)); - } - } - } - - prevSiteAndReads = new SiteGenotypeAndReads(ref.getLocus(), gt, readBases); - } - - int processedPairs = 0; - if (relevantSitePair) { - Map haplotypeCounts = new TreeMap(); // implemented Haplotype.compareTo() - - processedPairs = 1; - int totalCount = rdCounts.totalCount(); - System.out.println("\nPair: " + sp + " [# reads = " + totalCount + "]"); - - int matchCount = 0; - for (Map.Entry rdEntry : rdCounts.entrySet()) { - PhasingRead read = rdEntry.getKey(); - int count = rdEntry.getValue(); - - Haplotype readsHaplotype = new Haplotype(read); - haplotypeCounts.put(readsHaplotype, count); - - boolean readMatchesCalledHaplotype = calledHaplotypes != null && calledHaplotypes.contains(readsHaplotype); - if (readMatchesCalledHaplotype) - matchCount += count; - - System.out.println("read" + ": " + read + (readMatchesCalledHaplotype ? 
"*" : "") + "\tcount: " + count); - } - - double percentMatchingReads = 100 * (matchCount / (double) totalCount); - System.out.println("% MATCHING reads: " + percentMatchingReads + " [of " + totalCount + " TOTAL reads]"); - - out.print(sp); - if (allPossibleHaplotypes != null) { - for (Haplotype hap : allPossibleHaplotypes) { - Integer count = haplotypeCounts.get(hap); - if (count == null) // haplotype may not have been observed in ANY reads - count = 0; - - out.print("\t" + count); - } - } - out.println(); - } - - return processedPairs; - } - - private Haplotype successiveAllelesToHaplotype(Allele prevAll, Allele curAll) { - byte prevBase = SNPallelePair.getSingleBase(prevAll); - byte curBase = SNPallelePair.getSingleBase(curAll); - - byte[] hapBases = new byte[NUM_IN_PAIR]; - hapBases[0] = prevBase; - hapBases[1] = curBase; - return new Haplotype(hapBases); - } - - public Integer reduce(Integer addIn, Integer runningCount) { - if (addIn == null) - addIn = 0; - - return runningCount + addIn; - } - - /** - * @param result the number of reads and VariantContexts seen. 
- */ - public void onTraversalDone(Integer result) { - System.out.println("Validated " + result + " pairs of sites."); - } -} - -class SitePair implements Comparable { - public GenomeLoc site1; - public GenomeLoc site2; - - public SitePair(GenomeLoc site1, GenomeLoc site2) { - if (site1.size() > 1 || site2.size() > 1) - throw new UserException("Must give pairs of SINGLE-LOCUS record start sites"); - - this.site1 = site1; - this.site2 = site2; - } - - public String toString() { - return site1.toString() + "\t" + site2.toString(); - } - - public int compareTo(SitePair other) { - int comp1 = site1.compareTo(other.site1); - if (comp1 != 0) - return comp1; - - return site2.compareTo(other.site2); - } -} - -class SiteGenotypeAndReads { - public GenomeLoc site; - public Genotype gt; - public ReadBasesAtPosition readBases; - - public SiteGenotypeAndReads(GenomeLoc site, Genotype gt, ReadBasesAtPosition readBases) { - this.site = site; - this.gt = gt; - this.readBases = readBases; - } -} - -class PhasingReadList { - private Map readsAtSites = null; - private int numSites; - - public PhasingReadList(int numSites) { - this.readsAtSites = new HashMap(); - this.numSites = numSites; - } - - public void updateBases(int index, ReadBasesAtPosition readBases) { - if (readBases == null) - return; - - for (ReadBase rb : readBases) { - String readName = rb.readName; - - PhasingRead rd = readsAtSites.get(readName); - if (rd == null) { - rd = new PhasingRead(numSites, rb.mappingQual); - readsAtSites.put(readName, rd); - } - - // Arbitrarily updates to the last base observed for this sample and read (rb.base): - rd.updateBaseAndQuality(index, rb.base, rb.baseQual); - } - } - - public Set> entrySet() { - return readsAtSites.entrySet(); - } - - public int size() { - return readsAtSites.size(); - } -} - -class ReadCounter { - private Map counts; - private int totalCount; - - public ReadCounter() { - this.counts = new TreeMap(); // implemented PhasingRead.compareTo() - } - - public void 
incrementCount(PhasingRead rd) { - Integer cnt = counts.get(rd); - if (cnt == null) - cnt = 0; - - counts.put(rd, cnt + 1); - totalCount++; - } - - public Set> entrySet() { - return counts.entrySet(); - } - - public int totalCount() { - return totalCount; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCounts.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCounts.java deleted file mode 100644 index dd5ace93d..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCounts.java +++ /dev/null @@ -1,128 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import org.broadinstitute.sting.utils.BaseUtils; -import com.google.java.contract.*; - -import java.util.EnumMap; -import java.util.Map; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** -* Created by IntelliJ IDEA. -* User: depristo -* Date: 4/8/11 -* Time: 2:55 PM -*/ -final class BaseCounts { - public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.A; - public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); - - private final Map counts = new EnumMap(BaseIndex.class); // todo -- fixme -- include - and I events - { - for ( BaseIndex i : BaseIndex.values() ) - counts.put(i,0); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(byte base) { - BaseIndex i = BaseIndex.byteToBase(base); - if ( i != null ) // no Ns - counts.put(i, counts.get(i) + 1); - } - - public byte baseWithMostCounts() { - return maxBaseIndex().getByte(); - } - - @Ensures("result >= 0") - public int countOfMostCommonBase() { - return counts.get(maxBaseIndex()); - } - - @Ensures("result >= 0") - public int totalCount() { - int sum = 0; - - for ( int c : counts.values() ) { - sum += c; - } - - return sum; - } - - @Ensures({ - "result != null", - "totalCount() != 0 || result == MAX_BASE_INDEX_WITH_NO_COUNTS"}) - private BaseIndex maxBaseIndex() { - BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for ( BaseIndex i : counts.keySet() ) { - if ( counts.get(i) > counts.get(maxI) ) { - maxI = i; - } - } - return maxI; - } - - @Ensures("result != null") - public String toString() { - StringBuilder b = new StringBuilder(); - for ( Map.Entry elt : counts.entrySet() ) { - b.append(elt.toString()).append("=").append(elt.getValue()).append(","); - } - return b.toString(); - } - - private enum BaseIndex { - A ( 'A', 0 ), - C ( 'C', 1 ), - G ( 'G', 2 ), - T ( 'T', 3 ), - D ( 'D', 4 ), - I ( 'I', 5 ); 
// insertion to the right of the base - - final byte b; - final int index; - private BaseIndex(char base, int index) { - this.b = (byte)base; - this.index = index; - } - - public byte getByte() { return b; } - - public static final BaseIndex byteToBase(final byte base) { - switch (base) { - case 'A': return A; - case 'C': return C; - case 'G': return G; - case 'T': return T; - case 'D': return D; - case 'I': return I; - default: return null; - } - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusReadCompressor.java deleted file mode 100644 index 207972283..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusReadCompressor.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import com.google.java.contract.*; -import net.sf.samtools.SAMRecord; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 4/10/11 - * Time: 8:49 AM - * - * A general interface for ReadCompressors. Read compressors have the following semantics: - * - * The accept a stream of reads, in order, and after each added read returns a compressed stream - * of reads for emission. This stream of reads is a "reduced" representation of the total stream - * of reads. The actual compression approach is left up to the implementing class. - */ -public interface ConsensusReadCompressor { - /** - * Adds the read to the compressor. The returned iteratable collection of - * reads represents the incremental compressed output. - * @param read the next uncompressed read in the input stream to the compressor - * @return an iterator over the incrementally available compressed reads - */ - @Requires("read != null") - @Ensures("result != null") - Iterable addAlignment(SAMRecord read); - - /** - * Must be called after the last read has been added to finalize the compressor state - * and return the last compressed reads from the compressor. 
- * @return an iterator over the final compressed reads of this compressor - */ - @Ensures("result != null") - Iterable close(); -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java deleted file mode 100644 index 9a952e8ea..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java +++ /dev/null @@ -1,125 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; - -import java.util.*; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 4/8/11 - * Time: 3:01 PM - * - * Represents a single base site in the consensus calculation. A site corresponds to a place - * on the reference genome, or is a dummy site that is only used to calculate insertion statistics - */ -final class ConsensusSite { - final Collection overlappingReads = new LinkedList(); - final int offset, position; - final BaseCounts counts = new BaseCounts(); - - ConsensusSpan.Type markedType = null; - - public ConsensusSite(int position, int offset) { - this.position = position; - this.offset = offset; - } - - public int getPosition() { - return position; - } - - public Collection getOverlappingReads() { - return overlappingReads; - } - - /** - * Adds a pileup element (read / offset pair) to this consensus site. Assumes - * that the same element isn't added to site more than once. 
- * @param elt - */ - public void addOverlappingRead(PileupElement elt) { - overlappingReads.add(elt); - counts.incr(elt.getBase()); - } - - public boolean isStrongConsensus(final double maxFractionDisagreeingBases) { - int mostCommon = counts.countOfMostCommonBase(); - int total = counts.totalCount(); - double fractionCommonBase = (1.0 * mostCommon) / total; - return (1 - fractionCommonBase) < maxFractionDisagreeingBases; - } - - public final static class ConsensusBase { - byte base, qual; - - public byte getBase() { - return base; - } - - public byte getQual() { - return qual; - } - - public ConsensusBase(byte base, byte qual) { - this.base = base; - this.qual = qual; - } - } - - public ConsensusBase getConsensus() { - byte base = counts.baseWithMostCounts(); - int qual = 0; - - for ( PileupElement p : overlappingReads ) { - if ( p.getBase() == base ) - qual++; - } - - return new ConsensusBase(base, QualityUtils.boundQual(qual, (byte)64)); - } - - public String toString() { - return counts.toString(); - } - - public void setMarkedType(ConsensusSpan.Type markedType) { - this.markedType = markedType; - } - - public ConsensusSpan.Type getMarkedType() { - if ( markedType == null ) throw new ReviewedStingException("markedType not yet set!"); - return markedType; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSpan.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSpan.java deleted file mode 100644 index aa32d43f6..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSpan.java +++ /dev/null @@ -1,109 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import com.google.java.contract.*; -import org.broadinstitute.sting.utils.GenomeLoc; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated 
documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 4/8/11 - * Time: 3:01 PM - * - * Represents a span of a consensus region (conserved, or variable) on the reference genome. Supports - * either absolute or relative (refStart) positioning of the span. - */ -final class ConsensusSpan { - - /** - * The type of an span is either conserved (little variability within the span) or - * variable (too many differences among the reads in the span to determine the exact - * haplotype sequence). 
- */ - public enum Type { - CONSERVED, VARIABLE; - - public static Type otherType(Type t) { - switch ( t ) { - case CONSERVED: return VARIABLE; - case VARIABLE: return CONSERVED; - } - return CONSERVED; - } - } - - - final int refStart; // the start position on the reference for relative calculations - final GenomeLoc loc; - final Type consensusType; - - @Requires({"refStart >= 0", "loc != null", "consensusType != null"}) - @Ensures({"this.refStart == refStart", "this.loc.equals(loc)", "this.consensusType.equals(consensusType)"}) - public ConsensusSpan(final int refStart, GenomeLoc loc, ConsensusSpan.Type consensusType) { - if ( refStart < 0 ) throw new RuntimeException("RefStart must be greater than 0: " + refStart); - if ( loc == null ) throw new RuntimeException("Loc must not be null"); - if ( consensusType == null ) throw new RuntimeException("ConsensusType must not be null"); - - this.refStart = refStart; - this.loc = loc; - this.consensusType = consensusType; - } - - public int getOffsetFromStartOfSites() { - return loc.getStart() - refStart; - } - - @Ensures("result >= 0") - public int getGenomeStart() { - return loc.getStart(); - } - - @Ensures("result >= 0") - public int getGenomeStop() { - return loc.getStop(); - } - - public ConsensusSpan.Type getConsensusType() { - return consensusType; - } - - @Ensures("result >= 0") - public int size() { - return getGenomeStop() - getGenomeStart() + 1; - } - - @Ensures("result == !isVariable()") - public boolean isConserved() { return getConsensusType() == Type.CONSERVED; } - - @Ensures("result == !isConserved()") - public boolean isVariable() { return getConsensusType() == Type.VARIABLE; } - - @Ensures("result != null") - public String toString() { - return String.format("%s %s", consensusType, loc); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/MultiSampleConsensusReadCompressor.java 
b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/MultiSampleConsensusReadCompressor.java deleted file mode 100644 index 6d4c9ded5..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/MultiSampleConsensusReadCompressor.java +++ /dev/null @@ -1,88 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -/** - * - * @author depristo - * @version 0.1 - */ -public class MultiSampleConsensusReadCompressor implements ConsensusReadCompressor { - protected static final Logger logger = Logger.getLogger(MultiSampleConsensusReadCompressor.class); - - Map compressorsPerSample = new HashMap(); - - public MultiSampleConsensusReadCompressor(SAMFileHeader header, - final int readContextSize, - final GenomeLocParser glParser, - final int minBpForRunningConsensus, - final int targetDepthAtVariableSites) { - for ( String name : SampleUtils.getSAMFileSamples(header) ) { - compressorsPerSample.put(name, - new SingleSampleConsensusReadCompressor(name, readContextSize, - glParser, minBpForRunningConsensus, targetDepthAtVariableSites)); - // todo -- argument for minConsensusSize - } - } - - public Collection getReducedReadGroups() { - List rgs = new ArrayList(); - - for ( SingleSampleConsensusReadCompressor comp : compressorsPerSample.values() ) { - rgs.add(comp.getReducedReadGroup()); - } - - return rgs; - } - - @Override - public Iterable addAlignment(SAMRecord read) { - String sample = read.getReadGroup().getSample(); - SingleSampleConsensusReadCompressor compressor = compressorsPerSample.get(sample); - if ( compressor == null ) - throw new ReviewedStingException("No compressor for sample " + sample); - return compressor.addAlignment(read); - } - - @Override - public Iterable close() { - List reads = new LinkedList(); - for ( SingleSampleConsensusReadCompressor comp : compressorsPerSample.values() ) - for ( SAMRecord read : comp.close() ) - reads.add(read); - return reads; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ReduceReadsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ReduceReadsWalker.java deleted file mode 100755 index 88e697e14..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ReduceReadsWalker.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright 
(c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.io.*; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Set; - -/** - * Created by IntelliJ IDEA. 
- * User: depristo - * Date: April 7, 2011 - */ -public class ReduceReadsWalker extends ReadWalker { - @Output - protected StingSAMFileWriter out; - - @Output(fullName="bedOut", shortName = "bedOut", doc="BED output", required = false) - protected PrintStream bedOut = null; - - @Argument(fullName = "contextSize", shortName = "CS", doc = "", required = false) - protected int contextSize = 10; - - @Argument(fullName = "INCLUDE_RAW_READS", shortName = "IRR", doc = "", required = false) - protected boolean INCLUDE_RAW_READS = false; - - @Argument(fullName = "useRead", shortName = "UR", doc = "", required = false) - protected Set readNamesToUse; - - @Argument(fullName = "minBpForRunningConsensus", shortName = "mbrc", doc = "", required = false) - protected int minBpForRunningConsensus = 1000; - - @Argument(fullName = "maxReadsAtVariableSites", shortName = "mravs", doc = "", required = false) - protected int maxReadsAtVariableSites = 500; - - protected int totalReads = 0; - int nCompressedReads = 0; - - MultiSampleConsensusReadCompressor compressor; - - @Override - public void initialize() { - super.initialize(); - - compressor = new MultiSampleConsensusReadCompressor(getToolkit().getSAMFileHeader(), - contextSize, getToolkit().getGenomeLocParser(), - minBpForRunningConsensus, maxReadsAtVariableSites); - - out.setPresorted(false); - - for ( SAMReadGroupRecord rg : compressor.getReducedReadGroups()) - out.getFileHeader().addReadGroup(rg); - } - - @Override - public SAMRecord map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) { - totalReads++; - return read; // all the work is done in the reduce step for this walker - } - - - /** - * reduceInit is called once before any calls to the map function. 
We use it here to setup the output - * bam file, if it was specified on the command line - * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise - */ - @Override - public ConsensusReadCompressor reduceInit() { - return compressor; - } - - /** - * given a read and a output location, reduce by emitting the read - * @param read the read itself - * @return the SAMFileWriter, so that the next reduce can emit to the same source - */ - public ConsensusReadCompressor reduce( SAMRecord read, ConsensusReadCompressor comp ) { - if ( readNamesToUse == null || readNamesToUse.contains(read.getReadName()) ) { - if ( INCLUDE_RAW_READS ) - out.addAlignment(read); - - // write out compressed reads as they become available - for ( SAMRecord consensusRead : comp.addAlignment(read) ) { - out.addAlignment(consensusRead); - nCompressedReads++; - } - } - - return comp; - } - - @Override - public void onTraversalDone( ConsensusReadCompressor compressor ) { - //compressor.writeConsensusBed(bedOut); - // write out any remaining reads - for ( SAMRecord consensusRead : compressor.close() ) { - out.addAlignment(consensusRead); - nCompressedReads++; - } - - double percent = (100.0 * nCompressedReads) / totalReads; - logger.info("Compressed reads : " + nCompressedReads + String.format(" (%.2f%%)", percent)); - logger.info("Total reads : " + totalReads); - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/RefPileupElement.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/RefPileupElement.java deleted file mode 100755 index af06ba952..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/RefPileupElement.java +++ /dev/null @@ -1,108 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import net.sf.samtools.CigarElement; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.BaseUtils; -import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.PileupElement; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: Apr 14, 2009 - * Time: 8:54:05 AM - * To change this template use File | Settings | File Templates. 
- */ -public class RefPileupElement extends PileupElement { - final int refOffset; - - public RefPileupElement(SAMRecord read, int offset, int refOffset) { - super(read, offset); - this.refOffset = refOffset; - if ( refOffset < 0 ) - throw new ReviewedStingException("Bad RefPileupElement: ref offset < 0: " + refOffset + " for read " + read); - } - - public int getRefOffset() { - return refOffset; - } - - public static Iterable walkRead(SAMRecord read) { - return walkRead(read, 0); - } - - public static Iterable walkRead(final SAMRecord read, final int refIStart) { - return new Iterable() { - public Iterator iterator() { - List elts = new ArrayList(); - - // todo -- need to be ++X not X++ operators. The refI should go from -1, for reads like 2I2M, - // todo -- so that we can represent insertions to the left of the read - int readI = 0, refI = read.getAlignmentStart() - refIStart; - for ( CigarElement elt : read.getCigar().getCigarElements() ) { - int l = elt.getLength(); - switch (elt.getOperator()) { - case N: // cannot handle these - break; - case H : case P : // ignore pads and hard clips - break; - case S : - //refI += l; // move the reference too, in addition to I - readI += l; - break; - case I : - for ( int i = 0; i < l; i++) - if ( refI >= 0 ) - readI++; - // todo -- replace when insertions handled correctly -// elts.add(new RefPileupElement(read, readI++, refI)); - break; - case D : - for ( int i = 0; i < l; i++) - if ( refI >= 0 ) - elts.add(new RefPileupElement(read, -1, refI++)); - break; - case M : - for ( int i = 0; i < l; i++) - if ( refI >= 0 ) - elts.add(new RefPileupElement(read, readI++, refI++)); - break; - default: - throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); - } - } - - return elts.iterator(); - } - }; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java 
b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java deleted file mode 100644 index 9c5f28f6d..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java +++ /dev/null @@ -1,482 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.*; -import org.apache.commons.math.stat.descriptive.summary.Sum; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.clipreads.ClippingOp; -import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation; -import org.broadinstitute.sting.utils.clipreads.ReadClipper; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.io.PrintStream; -import java.util.*; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions 
of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * - * @author depristo - * @version 0.1 - */ -public class SingleSampleConsensusReadCompressor implements ConsensusReadCompressor { - protected static final Logger logger = Logger.getLogger(SingleSampleConsensusReadCompressor.class); - private static final boolean DEBUG = false; - private static final boolean INVERT = false; - private static final boolean PRINT_CONSENSUS_READS = false; - private static final int CYCLES_BEFORE_RETRY = 1000; - private static final double MAX_FRACTION_DISAGREEING_BASES = 0.1; - private static final ClippingRepresentation VARIABLE_READ_REPRESENTATION = ClippingRepresentation.SOFTCLIP_BASES; - private static final double MIN_FRACT_BASES_FOR_VARIABLE_READ = 0.33; // todo -- should be variable - private static final int MIN_BASES_IN_VARIABLE_SPAN_TO_INCLUDE_READ = 10; - protected final static String RG_POSTFIX = ".ReducedReads"; - public final static int REDUCED_READ_BASE_QUALITY = 30; - - // todo -- should merge close together spans - - /** The place where we ultimately write out our records */ - Queue waitingReads = new LinkedList(); - - final int readContextSize; - final int targetDepthAtVariableSites; - final int minBpForRunningConsensus; - int retryTimer = 0; - int consensusCounter = 0; - - final SAMReadGroupRecord reducedReadGroup; - String contig = null; - final GenomeLocParser glParser; - SAMFileHeader header; - GenomeLoc lastProcessedRegion = null; - - public SingleSampleConsensusReadCompressor(final String 
sampleName, - final int readContextSize, - final GenomeLocParser glParser, - final int minBpForRunningConsensus, - final int targetDepthAtVariableSites) { - this.readContextSize = readContextSize; - this.glParser = glParser; - this.minBpForRunningConsensus = minBpForRunningConsensus; - this.targetDepthAtVariableSites = targetDepthAtVariableSites; - this.reducedReadGroup = createReducedReadGroup(sampleName); - } - - /** - * Helper function to create a read group for these reduced reads - * @param sampleName - * @return - */ - private static final SAMReadGroupRecord createReducedReadGroup(final String sampleName) { - SAMReadGroupRecord rg = new SAMReadGroupRecord(sampleName + RG_POSTFIX); - rg.setSample(sampleName); - return rg; - } - - public SAMReadGroupRecord getReducedReadGroup() { - return reducedReadGroup; - } - - // ------------------------------------------------------------------------------------------ - // - // public interface functions - // - // ------------------------------------------------------------------------------------------ - - /** - * @{inheritDoc} - */ - @Override - public Iterable addAlignment( SAMRecord read ) { - if ( contig == null ) - contig = read.getReferenceName(); - if ( ! read.getReferenceName().equals(contig) ) - throw new ReviewedStingException("ConsensusRead system doesn't support multiple contig processing right now"); - - if ( header == null ) - header = read.getHeader(); - - if ( ! waitingReads.isEmpty() && read.getAlignmentStart() < waitingReads.peek().getAlignmentStart() ) - throw new ReviewedStingException( - String.format("Adding read %s starting at %d before current queue head start position %d", - read.getReadName(), read.getAlignmentStart(), waitingReads.peek().getAlignmentStart())); - - Collection result = Collections.emptyList(); - if ( retryTimer == 0 ) { - if ( chunkReadyForConsensus(read) ) { - result = consensusReads(false); - } - } else { - //logger.info("Retry: " + retryTimer); - retryTimer--; - } - - if ( ! 
read.getDuplicateReadFlag() && ! read.getNotPrimaryAlignmentFlag() && ! read.getReadUnmappedFlag() ) - waitingReads.add(read); - - return result; - } - - @Override - public Iterable close() { - return consensusReads(true); - } - - // ------------------------------------------------------------------------------------------ - // - // private implementation functions - // - // ------------------------------------------------------------------------------------------ - - private boolean chunkReadyForConsensus(SAMRecord read) { - if ( ! waitingReads.isEmpty() ) { - SAMRecord firstRead = waitingReads.iterator().next(); - int refStart = firstRead.getAlignmentStart(); - int refEnd = read.getAlignmentStart(); - int size = refEnd - refStart; - return size > minBpForRunningConsensus; - } else - return false; - } - -// -// public void writeConsensusBed(PrintStream bedOut) { -// for ( ConsensusSite site : calculateConsensusSites(waitingReads) ) { -// GenomeLoc loc = site.getLoc(); -// bedOut.printf("%s\t%d\t%d\t%s%n", loc.getContig(), loc.getStart()-1, loc.getStop(), site.counts); -// } -// } - - private Collection consensusReads(boolean useAllRemainingReads) { - if ( ! waitingReads.isEmpty() ) { - logger.info("Calculating consensus reads"); - List sites = calculateConsensusSites(waitingReads, useAllRemainingReads, lastProcessedRegion); - List rawSpans = calculateSpans(sites); - List spans = useAllRemainingReads ? rawSpans : excludeFinalSpan(rawSpans); - - if ( ! 
spans.isEmpty() ) { - lastProcessedRegion = spannedRegion(spans); - logger.info("Processing region: " + lastProcessedRegion); - updateWaitingReads(sites, spans); - return consensusReadsFromSitesAndSpans(sites, spans); - } else { - logger.info("Danger, spans is empty, may experience poor performance at: " + spannedRegion(rawSpans)); - retryTimer = CYCLES_BEFORE_RETRY; - return Collections.emptyList(); - } - } else { - return Collections.emptyList(); - } - } - - private static final List excludeFinalSpan(List rawSpans) { - logger.info("Dropping final, potentially incomplete span: " + rawSpans.get(rawSpans.size()-1)); - return rawSpans.subList(0, rawSpans.size() - 1); - } - - private static final GenomeLoc spannedRegion(List spans) { - GenomeLoc region = spans.get(0).loc; - for ( ConsensusSpan span : spans ) - region = region.merge(span.loc); - return region; - } - - private void updateWaitingReads(List sites, List spans) { - ConsensusSpan lastSpan = spans.get(spans.size() - 1); - Set unprocessedReads = new HashSet(); - - for ( ConsensusSite site : sites.subList(lastSpan.getOffsetFromStartOfSites() + 1, sites.size()) ) { - for ( PileupElement p : site.getOverlappingReads() ) - unprocessedReads.add(p.getRead()); - } - - logger.info(String.format("Updating waiting reads: old=%d reads, new=%d reads", waitingReads.size(), unprocessedReads.size())); - waitingReads = new LinkedList(ReadUtils.coordinateSortReads(new ArrayList(unprocessedReads))); - } - - private List expandVariableSites(List sites) { - for ( ConsensusSite site : sites ) - site.setMarkedType(ConsensusSpan.Type.CONSERVED); - - for ( int i = 0; i < sites.size(); i++ ) { - ConsensusSite site = sites.get(i); - if ( ! 
site.isStrongConsensus(MAX_FRACTION_DISAGREEING_BASES) ) { - int start = Math.max(i - readContextSize, 0); - int stop = Math.min(sites.size(), i + readContextSize + 1); - for ( int j = start; j < stop; j++ ) { - // aggressive tagging -- you are only conserved if you are never variable - sites.get(j).setMarkedType(ConsensusSpan.Type.VARIABLE); - } - } - } - - return sites; - } - - private List calculateSpans(List rawSites) { - List sites = expandVariableSites(rawSites); - List spans = new ArrayList(); - int start = 0; - - // our first span type is the type of the first site - ConsensusSpan.Type consensusType = sites.get(0).getMarkedType(); - while ( start < sites.size() ) { - ConsensusSpan span = findSpan(sites, start, consensusType); - - if ( span == null ) // we are done - return spans; - else { - spans.add(span); - start += span.size(); - } - - consensusType = ConsensusSpan.Type.otherType(consensusType); - } - - return spans; - } - - private ConsensusSpan findSpan(List sites, int start, ConsensusSpan.Type consensusType) { - int refStart = sites.get(0).getPosition(); - - for ( int end = start + 1; end < sites.size(); end++ ) { - ConsensusSite site = sites.get(end); - boolean conserved = site.getMarkedType() == ConsensusSpan.Type.CONSERVED; - if ( (consensusType == ConsensusSpan.Type.CONSERVED && ! 
conserved) || - (consensusType == ConsensusSpan.Type.VARIABLE && conserved) || - end + 1 == sites.size() ) { // we are done with the complete interval - GenomeLoc loc = glParser.createGenomeLoc(contig, start+refStart, end+refStart-1); - return new ConsensusSpan(refStart, loc, consensusType); - } - } - - return null; // couldn't find anything - } - - - private List calculateConsensusSites(Collection reads, boolean useAllRemainingReads, GenomeLoc lastProcessedRegion) { - List consensusSites = createEmptyConsensusSites(reads, lastProcessedRegion); - int refStart = consensusSites.get(0).getPosition(); - - for ( SAMRecord read : reads ) { - for ( RefPileupElement p : RefPileupElement.walkRead(read, refStart) ) { - // add to the consensus at this site - if ( p.getRefOffset() >= consensusSites.size() ) - throw new ReviewedStingException("BUG: ref offset off the consensus site list: " + p.getRead() + " at " + p.getRefOffset()); - consensusSites.get(p.getRefOffset()).addOverlappingRead(p); - } - } - - return consensusSites; - } - - private static List createEmptyConsensusSites(Collection reads, GenomeLoc lastProcessedRegion) { - SAMRecord firstRead = reads.iterator().next(); - - int minStart = lastProcessedRegion == null ? 
-1 : lastProcessedRegion.getStop() + 1; - int refStart = Math.max(firstRead.getAlignmentStart(), minStart); - int refEnd = furtherestEnd(reads); - - logger.info("Calculating sites for region " + refStart + " to " + refEnd); - // set up the consensus site array - List consensusSites = new ArrayList(); - int len = refEnd - refStart + 1; - for ( int i = 0; i < len; i++ ) { - int position = refStart + i; - //GenomeLoc loc = glParser.createGenomeLoc(contig, l, l); - consensusSites.add(new ConsensusSite(position, i)); - } - - return consensusSites; - } - - private List consensusReadsFromSitesAndSpans(List sites, List spans) { - List reads = new ArrayList(); - - for ( ConsensusSpan span : spans ) { - //logger.info("Span is " + span); - if ( span.isConserved() ) - reads.addAll(conservedSpanReads(sites, span)); - else - reads.addAll(downsample(variableSpanReads(sites, span), span)); - } - - return reads; - } - - /** - * Downsamples the reads until we have 2x the ideal target depth in the span. - * - * todo: perhaps it would be better to smooth coverage, so that the probability of - * todo: retaining a read would be proportional to the over-coverage of each site - * - * @param reads - * @param span - * @return - */ - private Collection downsample(Collection reads, ConsensusSpan span) { - // ideally, we would have exactly span bp at target depth, x2 for the directionality of reads - int idealBPinSpan = span.size() * targetDepthAtVariableSites * 2; - int rawBPinSpan = readsBP(reads); - - // The chance we want to keep a particular bp is ideal / actual - double pKeepPerBP = (1.0 * idealBPinSpan) / rawBPinSpan; - - if ( pKeepPerBP >= 1.0 ) { // not enough coverage - return reads; - } else { // we don'need to downsample - List downsampled = new ArrayList(); - for ( SAMRecord read : reads ) { - // should this be proportional to read length? 
- double pKeep = pKeepPerBP; // * read.getReadLength(); - if ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < pKeep ) { - downsampled.add(read); - } - } - - logger.info(String.format("targetDepth=%d, idealBP=%d, rawBP=%d, pKeepPerBP=%.2e, nRawReads=%d, nKeptReads=%d, keptBP=%d", - targetDepthAtVariableSites, idealBPinSpan, rawBPinSpan, pKeepPerBP, reads.size(), downsampled.size(), readsBP(downsampled))); - return downsampled; - } - } - - private static final int readsBP(Collection reads) { - int sum = 0; - for ( SAMRecord read : reads ) sum += read.getReadLength(); - return sum; - } - - private List conservedSpanReads(List sites, ConsensusSpan span) { - byte[] bases = new byte[span.size()]; - byte[] quals = new byte[span.size()]; - - for ( int i = 0; i < span.size(); i++ ) { - int refI = i + span.getOffsetFromStartOfSites(); - ConsensusSite site = sites.get(refI); - if ( site.getMarkedType() == ConsensusSpan.Type.VARIABLE ) - throw new ReviewedStingException("Variable site included in consensus: " + site); - final int count = site.counts.countOfMostCommonBase(); - byte base = count == 0 ? 
(byte)'N' : site.counts.baseWithMostCounts(); - if ( !BaseUtils.isRegularBase(base) ) { - // todo -- this code needs to be replaced with cigar building code as well - logger.warn("Substituting N for non-regular consensus base " + (char)base); - base = (byte)'N'; - } - - bases[i] = base; - quals[i] = QualityUtils.boundQual(count, (byte)64); - } - - SAMRecord consensus = new SAMRecord(header); - consensus.setAttribute("RG", reducedReadGroup.getId()); - consensus.setAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG, Integer.valueOf(REDUCED_READ_BASE_QUALITY)); - consensus.setReferenceName(contig); - consensus.setReadName(String.format("%s.read.%d", reducedReadGroup.getId(), consensusCounter++)); - consensus.setReadPairedFlag(false); - consensus.setReadUnmappedFlag(false); - consensus.setCigarString(String.format("%dM", span.size())); - consensus.setAlignmentStart(span.getGenomeStart()); - consensus.setReadBases(bases); - consensus.setBaseQualities(quals); - consensus.setMappingQuality(60); - -// if ( INVERT && PRINT_CONSENSUS_READS ) -// for ( SAMRecord read : consensusReads ) -// finalDestination.addAlignment(read); - - return Collections.singletonList(consensus); - } - - @Requires({"sites != null", "span.isVariable()"}) - @Ensures("result != null") - private Collection variableSpanReads(List sites, ConsensusSpan span) { - Collection reads = new LinkedList(); - Set readNames = new HashSet(); - - for ( int i = 0; i < span.size(); i++ ) { - int refI = i + span.getOffsetFromStartOfSites(); - - for ( PileupElement p : sites.get(refI).getOverlappingReads() ) { - if ( readNames.contains(p.getRead().getReadName()) ) { - ; - //logger.info("Rejecting already seen read: " + p.getRead().getReadName()); - } else { - readNames.add(p.getRead().getReadName()); - SAMRecord read = clipReadToSpan(p.getRead(), span); - if ( keepClippedReadInVariableSpan(p.getRead(), read) ) - reads.add(read); - } - } - } - - return reads; - } - - private final static boolean 
keepClippedReadInVariableSpan(SAMRecord originalRead, SAMRecord variableRead) { - int originalReadLength = originalRead.getReadLength(); - int variableReadLength = variableRead.getReadLength(); - - return variableReadLength >= MIN_BASES_IN_VARIABLE_SPAN_TO_INCLUDE_READ; -// && -// ((1.0 * variableReadLength) / originalReadLength) >= MIN_FRACT_BASES_FOR_VARIABLE_READ; - } - - private SAMRecord clipReadToSpan(SAMRecord read, ConsensusSpan span) { - ReadClipper clipper = new ReadClipper(read); - int spanStart = span.getGenomeStart(); - int spanEnd = span.getGenomeStop(); - int readLen = read.getReadLength(); - - for ( RefPileupElement p : RefPileupElement.walkRead(read) ) { - if ( p.getRefOffset() == spanStart && p.getOffset() != 0 ) { - clipper.addOp(new ClippingOp(0, p.getOffset() - 1)); - } - - if ( p.getRefOffset() == spanEnd && p.getOffset() != readLen - 1 ) { - clipper.addOp(new ClippingOp(p.getOffset() + 1, readLen - 1)); - } - } - - SAMRecord softClipped = clipper.clipRead(VARIABLE_READ_REPRESENTATION); - return ReadUtils.hardClipSoftClippedBases(softClipped); - } - - private static int furtherestEnd(Collection reads) { - int end = -1; - for ( SAMRecord read : reads ) { - end = Math.max(end, read.getAlignmentEnd()); - } - return end; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java deleted file mode 100755 index 8c1fca4c6..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java +++ /dev/null @@ -1,233 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.replication_validation; - -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import 
org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.MathUtils; - -import java.io.PrintStream; - -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.util.*; - -/** - * Implementation of the replication and validation framework with reference based error model - * for pooled sequencing. - * - * The input should be a BAM file with pooled sequencing data where each pool is represented by - * samples with the same barcode. - * - * A reference sample name must be provided and it must be barcoded uniquely. - */ -@Requires(value={},referenceMetaData={@RMD(name="reference", type=VariantContext.class)}) -public class ReplicationValidationWalker extends LocusWalker implements TreeReducible { - - - @Argument(shortName="refsample", fullName="reference_sample_name", doc="Reference sample name.", required=true) - String referenceSampleName; - - @Argument(shortName="nsamples", fullName="number_of_samples", doc="Number of samples in the dataset (not counting the reference sample).", required=true) - int nSamples = -1; - - @Argument(shortName="nchr", fullName="number_of_chromosomes", doc="Number of chromosomes per sample (in case you're not dealing with diploids). Default: 2.", required=false) - int nChromosomes = 2; - - @Argument(shortName="maxac", fullName="max_allele_count", doc="Max number of alleles expected in a site. 
Smaller numbers process faster. Default: 2 * number of samples. ", required=false) - int overrideMaxAlleleCount = -1; - - @Argument(shortName="maxqs", fullName="max_quality_score", doc="Max quality score to consider. Smaller numbers process faster. Default: Q40.", required=false) - int maxQualityScore= 40; - - @Argument(shortName="prior", fullName="site_quality_prior", doc="Phred-Scaled prior quality of the site. Default: Q20.", required=false) - byte defaultPrior= 20; - - @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) - boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - - - @Output(doc="Write output to this file instead of STDOUT") - PrintStream out; - - int maxAlleleCount; - - final String REFERENCE_ROD_NAME = "reference"; - - - /** - * GATK Engine creates readgroups of the form XXX.Y.Z - * XXX.Y is the unique lane identifier. - * Z is the id of the sample to make the read group id unique - * This function returns a list of unique lane identifiers. - * @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup) - * @return a collection of lane ids. - */ - private Set getLaneIDs(Collection readGroups) { - HashSet result = new HashSet(); - for (String rgid : readGroups) { - String [] parsedId = rgid.split("\\."); - result.add(parsedId[0] + "." + parsedId[1]); - } - return result; - } - - /** - * Calculates he probability of the data (reference sample reads) given the phred scaled site quality score. - * @param referenceSamplePileup reference sample pileup - * @param refBases base from the reference sequence for this site - * @param phredScaledPrior phred scaled expected site quality (prior) - * @return an array of log10 probabilities of site qualities ranging from Q1-Q40. 
- */ - private double[] buildErrorModel (ReadBackedPileup referenceSamplePileup, Collection refBases, byte phredScaledPrior) { - double [] model = new double[maxQualityScore+1]; - byte [] data = referenceSamplePileup.getBases(); - - int coverage = data.length; - int mismatches = getNumberOfMismatches(data, refBases); - int matches = coverage - mismatches; - - for (byte q=0; q<=maxQualityScore; q++) { - double probMismatch = MathUtils.phredScaleToProbability(q); - model[q] = MathUtils.phredScaleToLog10Probability(phredScaledPrior) + - MathUtils.log10BinomialCoefficient(coverage, mismatches) + - mismatches * Math.log10(probMismatch) + - matches * Math.log10(1-probMismatch); - } - return model; - } - - /** - * Returns the number of mismatching bases in a pileup - * @param data the bases of a pileup - * @param refBase the reference sample base to compare to - * @return number of bases in data that are different from refBase - */ - private int getNumberOfMismatches (byte[] data, Collection refBase) { - int mismatches = 0; - for (byte seqBase : data) { - if (!refBase.contains(seqBase)) - mismatches++; - } - return mismatches; - } - - /** - * Returns the true bases for the reference sample in this locus. Homozygous loci will return one base - * but heterozygous will return two bases (hence why it returns a collection). - * - * @param referenceSampleContext the variant context from the reference sample ROD track - * @param ref the reference sequence context - * @return the true bases for the reference sample. - */ - private Collection getTrueBases(VariantContext referenceSampleContext, ReferenceContext ref) { - - ArrayList trueReferenceBase = new ArrayList(); - - // Site is not a variant, take from the reference - if (referenceSampleContext == null) { - trueReferenceBase.add(ref.getBase()); - } - - else if (referenceSampleContext.isIndel()) { - return null; // TODO: add special treatment for extended events. For Now just skip these altogether. 
- } - - // Site has a VCF entry -- is variant - else { - // Site is filtered, don't mess with it if option is set - if (referenceSampleContext.isFiltered() && EXCLUDE_FILTERED_REFERENCE_SITES) { - return null; - } - - Genotype referenceGenotype = referenceSampleContext.getGenotype(referenceSampleName); - List referenceAlleles = referenceGenotype.getAlleles(); - for (Allele allele : referenceAlleles) { - byte [] bases = allele.getBases(); - for (byte b : bases) { - trueReferenceBase.add(b); - } - } - } - return trueReferenceBase; - } - - public void initialize() { - - // Set the max allele count (defines the size of the error model array) - maxAlleleCount = (overrideMaxAlleleCount > 0) ? overrideMaxAlleleCount : nSamples*nChromosomes; - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - // Get reference base from VCF or Reference - VariantContext referenceSampleContext = tracker.getVariantContext(ref, REFERENCE_ROD_NAME, context.getLocation()); - Collection trueReferenceBases = getTrueBases(referenceSampleContext, ref); - - // If there is no true reference base in this locus, skip it. 
- if (trueReferenceBases == null) - return 0; - - ReadBackedPileup contextPileup = context.getBasePileup(); - Set lanesInLocus = getLaneIDs(contextPileup.getReadGroups()); - for (String laneID : lanesInLocus) { - // make a pileup for this lane - ReadBackedPileup lanePileup = contextPileup.getPileupForLane(laneID); - Collection samplesInLane = lanePileup.getSampleNames(); - - // we can only analyze loci that have reads for the reference sample - if (samplesInLane.contains(referenceSampleName)) { - - // build reference sample pileup - ReadBackedPileup referenceSamplePileup = lanePileup.getPileupForSampleName(referenceSampleName); - - // Build error model - double [] errorModel = buildErrorModel(referenceSamplePileup, trueReferenceBases, defaultPrior); - - // iterate over all samples (pools) in this lane except the reference - samplesInLane.remove(referenceSampleName); - for (String pool : samplesInLane) { - ReadBackedPileup poolPileup = lanePileup.getPileupForSampleName(pool); - - // Debug error model - if (referenceSamplePileup.getBases().length > 50) { - System.out.println("\n" + laneID + " - " + pool + ": " + referenceSamplePileup.getBases().length); - for (double v : errorModel) - System.out.print(v + ", "); - System.out.println(); - } - } - } - // todo: call each pool for this site - // todo: merge pools - // todo: decide whether or not it's a variant - } - return 1; - } - - public Long reduceInit() { - return 0l; - } - - public Long reduce(Integer value, Long sum) { - return value + sum; - } - - public Long treeReduce(Long lhs, Long rhs) { - return lhs + rhs; - } - - public void onTraversalDone( Long c ) { - out.println(c); - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/validation/DeclareValidityWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/validation/DeclareValidityWalker.java deleted file mode 100644 index 95266d2e9..000000000 --- 
a/java/src/org/broadinstitute/sting/playground/gatk/walkers/validation/DeclareValidityWalker.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.validation; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; - -import java.util.*; - - -/** - * Declares the validity of variants in a vcf as either true or false. For use with the IGV crowd-sourcing bed generation - */ - -@Requires(value={},referenceMetaData=@RMD(name="validated", type=VariantContext.class)) -public class DeclareValidityWalker extends RodWalker{ - - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter vcfWriter = null; - - @Argument(fullName = "validity", shortName = "V", - doc = "Rank of variant validity on a 0-4 scale where 0 is definitely false positive; 4 is definitely true positive.") - int validity; - - @Argument(fullName = "Note", shortName = "N", doc = "Annotation to be included in FP/TP field", required = false) - String note ="."; - - @Argument(fullName = "Source", shortName = "s", doc = "Institutional source of annotation", required = false) - String source = "."; - - @Argument(fullName = "Build", shortName = "bld", doc = "Genome build", required = false) - String build = "."; - - - - public Integer reduceInit() { - - for(char a : note.toCharArray()){ - if(Character.isWhitespace(a)) throw new UserException("White space is not allowed in VCF Info fields, please omit it from your build, note, and source arguments."); - 
} - - for(char b : source.toCharArray()){ - if(Character.isWhitespace(b)) throw new UserException("White space is not allowed in VCF Info fields, please omit it from your build, note, and source arguments."); - } - - Set old = VCFUtils.getHeaderFields(getToolkit()); - Set newlines = new HashSet(); - for(VCFHeaderLine each : old){ - if(each.getKey().equals("fileformat")) newlines.add(each); - } - - if (build.equals(".")) build=getBuild(); - VCFHeaderLine ref = new VCFHeaderLine("reference", build); - newlines.add(ref); - vcfWriter.writeHeader(new VCFHeader(newlines)); - - return 0; //To change body of implemented methods use File | Settings | File Templates. - } - - - - - - - - public Map addValidation(int Validity, String Note, String Source){ - - - - HashMap validityAnnots = new HashMap(); - validityAnnots.put("validity", Validity); - validityAnnots.put("user", System.getenv("USER")); - validityAnnots.put("note", Note); - validityAnnots.put("Source", Source); - - return validityAnnots; - } - - public String getBuild(){ - String refPath = getToolkit().getArguments().referenceFile.getPath(); - if (refPath.contains("19")) {return "hg19";} - else if (refPath.contains("18")) {return "hg18";} - else if (refPath.contains("36")) {return "b36";} - else if (refPath.contains("37")) {return "b37";} - else {return "unknown";} - } - /** - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context){ - if ( tracker == null ) - return 0; - - - VariantContext current = tracker.getVariantContext(ref, "validated", context.getLocation()); - if (current == null) { - return 0;} - - VariantContext declared = VariantContext.modifyAttributes( current, addValidation(validity, note, source)); - vcfWriter.add(declared, ref.getBase()); - return 1; - } - - 
public Integer reduce(Integer counter, Integer sum) { - return counter+sum; - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationWalker.java deleted file mode 100644 index 9e832fde6..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationWalker.java +++ /dev/null @@ -1,153 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.validation; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.*; -import java.math.BigInteger; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.Collection; -import java.util.List; - -/** - * a walker for validating (in the style of validating pile-up) the ROD system. 
- */ -@Reference(window=@Window(start=-40,stop=40)) -public class RodSystemValidationWalker extends RodWalker { - - // the divider to use in some of the text output - private static final String DIVIDER = ","; - - @Output - public PrintStream out; - - @Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts") - public boolean allRecordsVariantContextEquivalent = false; - - // used to calculate the MD5 of a file - MessageDigest digest = null; - - // we sometimes need to know what rods the engine's seen - List rodList; - - /** - * emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change - * underneath us). - */ - public void initialize() { - // setup the MD5-er - try { - digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new ReviewedStingException("Unable to find MD5 checksumer"); - } - out.println("Header:"); - // enumerate the list of ROD's we've loaded - rodList = this.getToolkit().getRodDataSources(); - for (ReferenceOrderedDataSource rod : rodList) { - out.println(rod.getName() + DIVIDER + rod.getType()); - out.println(rod.getName() + DIVIDER + rod.getFile()); - out.println(rod.getName() + DIVIDER + md5sum(rod.getFile())); - } - out.println("Data:"); - } - - /** - * - * @param tracker the ref meta data tracker to get RODs - * @param ref reference context - * @param context the reads - * @return an 1 for each site with a rod(s), 0 otherwise - */ - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - int ret = 0; - if (tracker != null && tracker.getAllRods().size() > 0) { - out.print(context.getLocation() + DIVIDER); - Collection features = tracker.getAllRods(); - for (GATKFeature feat : features) - out.print(feat.getName() + DIVIDER); - out.println(";"); - ret++; - } - - // if the argument was set, check for equivalence - if 
(allRecordsVariantContextEquivalent && tracker != null) { - Collection col = tracker.getAllVariantContexts(ref); - VariantContext con = null; - for (VariantContext contextInList : col) - if (con == null) con = contextInList; - else if (!con.equals(col)) out.println("FAIL: context " + col + " doesn't match " + con); - } - return ret; - } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - @Override - public Integer reduceInit() { - return 0; - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - @Override - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - @Override - public void onTraversalDone(Integer result) { - // Double check traversal result to make count is the same. - // TODO: Is this check necessary? - out.println("[REDUCE RESULT] Traversal result is: " + result); - } - - // shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html - private String md5sum(File f) { - InputStream is; - try { - is = new FileInputStream(f); - } catch (FileNotFoundException e) { - return "Not a file"; - } - byte[] buffer = new byte[8192]; - int read = 0; - try { - while ((read = is.read(buffer)) > 0) { - digest.update(buffer, 0, read); - } - byte[] md5sum = digest.digest(); - BigInteger bigInt = new BigInteger(1, md5sum); - return bigInt.toString(16); - } - catch (IOException e) { - throw new RuntimeException("Unable to process file for MD5", e); - } - finally { - try { - is.close(); - } - catch (IOException e) { - throw new RuntimeException("Unable to close input stream for MD5 calculation", e); - } - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantutils/CGVarToVCF.java 
b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantutils/CGVarToVCF.java deleted file mode 100755 index 51951e56c..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantutils/CGVarToVCF.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.variantutils; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; - -import java.util.*; - -/** - * Converts variants from the Complete Genomics VAR format to VCF format. - */ -@Requires(value={},referenceMetaData=@RMD(name=CGVarToVCF.INPUT_ROD_NAME, type=VariantContext.class)) -@Reference(window=@Window(start=-40,stop=400)) -public class CGVarToVCF extends RodWalker { - - public static final String INPUT_ROD_NAME = "variant"; - - @Output(doc="File to which variants should be written", required=true) - protected VCFWriter vcfWriter = null; - - @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod", required=true) - protected String sampleName = null; - - public void initialize() { - HashSet samples = new HashSet(1); - samples.add(sampleName); - vcfWriter.writeHeader(new VCFHeader(new HashSet(), samples)); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - Collection contexts = tracker.getVariantContexts(ref, INPUT_ROD_NAME, null, ref.getLocus(), true, false); - - // for now, we don't support the mixed type - if ( contexts.size() == 0 || contexts.size() > 2 ) - return 0; - - Iterator iter = contexts.iterator(); - if ( contexts.size() == 1 ) { - writeHet(iter.next(), ref.getBase()); - } else { - VariantContext vc1 = 
iter.next(); - VariantContext vc2 = iter.next(); - if ( vc1.getType().equals(vc2.getType()) ) - writeHom(vc1, ref.getBase()); - } - - return 0; - } - - private void writeHet(VariantContext vc, byte ref) { - List alleles = new ArrayList(vc.getAlleles()); - Genotype g = new Genotype(sampleName, alleles); - write(vc, ref, g); - } - - private void writeHom(VariantContext vc, byte ref) { - List alleles = new ArrayList(2); - alleles.add(vc.getAlternateAllele(0)); - alleles.add(vc.getAlternateAllele(0)); - Genotype g = new Genotype(sampleName, alleles); - write(vc, ref, g); - } - - private void write(VariantContext vc, byte ref, Genotype g) { - HashMap genotypes = new HashMap(1); - genotypes.put(sampleName, g); - vc = VariantContext.modifyGenotypes(vc, genotypes); - if ( vc.isSNP() ) - vc = VariantContext.modifyLocation(vc, vc.getChr(), vc.getStart()+1, vc.getStart()+1); - vcfWriter.add(vc, ref); - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java b/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java deleted file mode 100644 index f21903acf..000000000 --- a/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.playground.sample; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; - -import java.util.ArrayList; -import java.util.HashMap; - -/** - * Extends locus walker to print how many reads there are at 
each locus, by population - */ -public class CountLociByPopulationWalker extends LocusWalker implements TreeReducible { - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - // in this HashMap, we'll keep count of how many - HashMap count = new HashMap(); - - ArrayList reads = (ArrayList) context.getBasePileup().getReads(); - - for (SAMRecord read : reads) { - - // get the sample - Sample sample = getToolkit().getSampleByRead(read); - if (sample == null) - return 1; - - if (!count.containsKey(sample.getPopulation())) { - count.put(sample.getPopulation(), 1); - } - count.put(sample.getPopulation(), count.get(sample.getPopulation()) + 1); - } - - System.out.println("\n\n\n***** LOCUS: " + ref.getLocus().toString() + " *****"); - for (String population : count.keySet()) { - System.out.println(String.format("%s | %d", population, count.get(population))); - } - - return 1; - } - - public Long reduceInit() { return 0l; } - - public Long reduce(Integer value, Long sum) { - return value + sum; - } - - /** - * Reduces two subtrees together. In this case, the implementation of the tree reduce - * is exactly the same as the implementation of the single reduce. 
- */ - public Long treeReduce(Long lhs, Long rhs) { - return lhs + rhs; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java b/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java deleted file mode 100644 index 20cb7fe3d..000000000 --- a/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.broadinstitute.sting.playground.sample; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; - -/** - * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. - * Can also count the number of reads matching a given criterion using read filters (see the - * --read-filter command line argument). Simplest example of a read-backed analysis. - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountMalesWalker extends ReadWalker { - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - Sample sample = getToolkit().getSampleByRead(read); - return sample.isMale() ? 
1 : 0; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/tools/BCMMarkDupes.java b/java/src/org/broadinstitute/sting/playground/tools/BCMMarkDupes.java deleted file mode 100755 index 5e8b22807..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/BCMMarkDupes.java +++ /dev/null @@ -1,122 +0,0 @@ -package org.broadinstitute.sting.playground.tools; - -import net.sf.samtools.*; -import java.io.*; -import java.util.*; - - -/** - * - * Marks duplicate reads in a sam or bam file based on start point and strand of the alignment. - * Requires: java samtools "picard". java 1.5 or better. - * - * Note, will not unmark duplicates. The final number of duplicates marked at the end is not necess. the total number of marked duplicates in the final output file. - * bam or sam type is inferred by the number of arguments, read order, header, filetype is preserved in the output file. 
- * - * Usage: [index file if using bam] - * @author bainbrid - * - */ -public class BCMMarkDupes -{ - final static String inHT = "ht"; - - /** - * Usage: [index file if using bam] - * @param args - * @throws Exception - */ - public static void main(String[] args) throws Exception - { - int cnt = 0; - int dcnt = 0; - - // begin hack by kiran - - /* - if(args.length != 2 && args.length !=3 ) - { - throw new Exception("Invalid number of arguments.\n"+usage()); - } - boolean isBam = args.length == 3; - String inputfile = args[0]; - String index = null; - if(isBam) index = args[1]; - String outputfile = args[1]; - if(isBam) index = args[2]; - */ - - String inputfile = args[0]; - String index = inputfile + ".bai"; - String outputfile = args[1]; - boolean isBam = true; - - // end hack by kiran - - //set the reader and writer, and header - SAMFileReader sfr = null; - if(isBam) - sfr = new SAMFileReader(new File(inputfile), new File(index)); - else - sfr = new SAMFileReader(new File(inputfile)); - Iterator iter = sfr.iterator(); - SAMFileWriter fw = null; - SAMFileHeader header = sfr.getFileHeader(); - if(isBam) - fw = (new SAMFileWriterFactory()).makeBAMWriter(header, true, new File(outputfile)); - else - fw = (new SAMFileWriterFactory()).makeSAMWriter(header, true, new File(outputfile)); - Hashtable ht = new Hashtable(2000000); - - //iterate through file and mark dupes - while(iter.hasNext()) - { - - SAMRecord rec = iter.next(); - int start = 0; - String strand = "N"; - if(rec.getReadNegativeStrandFlag()) - { - start = rec.getAlignmentEnd(); - //start = rec.getUnclippedEnd(); - } - else - { - strand = "P"; - start = rec.getAlignmentStart(); - //start = rec.getUnclippedStart(); - } - if(start == 0) continue; - cnt++; - String chromo = rec.getReferenceName(); - String key = chromo+strand+start; - String s = ht.get(key); - if(s == null) - { - ht.put(key, inHT); - //rec.setDuplicateReadFlag(false); - } - else - { - rec.setDuplicateReadFlag(true); - dcnt++; - } - 
fw.addAlignment(rec); - } - fw.close(); - System.err.println("Total records:"+cnt); - System.err.println("Records marked as duplicates:"+dcnt); - System.err.println("HT:"+ht.size()); - - } - - - - - public static String usage() - { - return "usage:\n\t [index file if using bam] "; - - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/tools/FilterReads.java b/java/src/org/broadinstitute/sting/playground/tools/FilterReads.java deleted file mode 100644 index 0b519af4c..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/FilterReads.java +++ /dev/null @@ -1,112 +0,0 @@ -package org.broadinstitute.sting.playground.tools; - -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Usage; -import net.sf.picard.cmdline.Option; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; -import net.sf.samtools.SAMRecord; - -import java.io.File; -import java.util.List; -import java.util.ArrayList; -import java.util.Collections; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Aug 11, 2009 - * Time: 5:13:58 PM - * To change this template use File | Settings | File Templates. 
- */ -public class FilterReads extends CommandLineProgram { - @Usage(programVersion="1.0") public String USAGE = "Filters reads: the output file will contain only reads satisfying all the selected criteria"; - @Option(shortName="I", doc="Input file (bam or sam) to extract reads from.", - optional=false) public File IN = null; - @Option(shortName="O",doc="Output file (bam or sam) to write extracted reads to.", - optional=false) public File OUT = null; - @Option(shortName="U", doc="Select only unmapped reads if true; only mapped reads if false; both if not specified.", - optional=true) public Boolean UNMAPPED = null; - @Option(shortName="MINQ", doc="Select only reads with minimum base quality across all bases at or above the specified value.", - optional=true) public Integer MIN_QUAL = 0; - @Option(shortName="AVQ", doc="Select only reads with average base quality at or above the specified value.", - optional=true) public Double AVERAGE_QUAL = 0.0; - @Option(shortName="MAPQ", doc="Select only reads with mapping quality at or above the specified value (does not affect unmapped reads, use 'U').", - optional=true) public Integer MAPPING_QUAL = 0; - @Option(shortName="MAXE",doc="Select only reads with edit distance from the reference at or below the specified value ('NM' tags must be present in the input file).", - optional = true) public Integer MAX_ERRORS = INFINITY; - @Option(shortName="MINE",doc="Select only reads with edit distance from the reference at or above the specified value ('NM' tags must be present in the input file).", - optional = true) public Integer MIN_ERRORS = 0; - - private static int INFINITY = 1000000; - UnmappedFilter uFilter; - - /** Required main method implementation. 
*/ - public static void main(final String[] argv) { - System.exit(new FilterReads().instanceMain(argv)); - } - - protected int doWork() { - - if ( UNMAPPED == null ) uFilter = UnmappedFilter.BOTH; - else { - if ( UNMAPPED.booleanValue() ) uFilter = UnmappedFilter.UNMAPPED; - else uFilter = UnmappedFilter.MAPPED; - } - - - SAMFileReader inReader = new SAMFileReader(IN); - - SAMFileWriter outWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(inReader.getFileHeader(), true, OUT) ; - - for ( SAMRecord read : inReader ) { - switch ( uFilter ) { - case UNMAPPED: if ( ! read.getReadUnmappedFlag() ) continue; break; - case MAPPED: if ( read.getReadUnmappedFlag() ) continue; break; - } - - if ( ! read.getReadUnmappedFlag() ) { - // these filters are applicable only to mapped reads: - if ( read.getMappingQuality() < MAPPING_QUAL ) continue; - if ( MAX_ERRORS < INFINITY ) { - Object attr = read.getAttribute("NM"); - if ( attr != null ) { - int nm = (Integer)attr; - if ( nm > MAX_ERRORS ) continue; - if ( nm < MIN_ERRORS ) continue; - } - } - } - - - if ( MIN_QUAL > 0 || AVERAGE_QUAL > 0 ) { - byte[] quals = read.getBaseQualities(); - double av_q = 0.0; - boolean passed = true; - for ( int i = 0 ; i < quals.length ; i++ ) { - if ( quals[i] < MIN_QUAL ) { - passed = false; - break; - } - av_q += (double)quals[i]; - } - if ( ! 
passed ) continue; - if ( av_q / read.getReadLength() < AVERAGE_QUAL ) continue; - } - - outWriter.addAlignment(read); - } - - inReader.close(); - outWriter.close(); - return 0; - } - - enum UnmappedFilter { - UNMAPPED, MAPPED, BOTH - } - -} - - diff --git a/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java b/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java deleted file mode 100644 index 0e81b6214..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java +++ /dev/null @@ -1,665 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.tools; - -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Usage; -import net.sf.picard.cmdline.Option; -import net.sf.samtools.*; - -import java.io.File; -import java.util.List; -import java.util.ArrayList; - -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.playground.utils.ParallelSAMIterator; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Aug 27, 2009 - * Time: 3:53:48 PM - * To change this template use File | Settings | File Templates. - */ -public class PairMaker extends CommandLineProgram { - @Usage(programVersion="1.0") public String USAGE = "Reconstitutes mate pairs from alignments"+ - " for individual fragment ends. Individual end alignment files are expected to be sorted by"+ - " read name. Multiple alignments are allowed and in this case the single best pairing will be selected."; - @Option(shortName="I1", - doc="Input file (bam or sam) with alignments for end 1 (first mate in a pair).", - optional=false) - public File IN1 = null; - @Option(shortName="I2", - doc="Input file (bam or sam) with alignments for end 2 (second mate in a pair).", - optional=false) - public File IN2 = null; - @Option(shortName="O",optional=false, doc="Output file to write found/selected pairs into.") - public File OUTPUT = null; - @Option(shortName="F",doc="Turns on a 'filter' mode: only records/pairs passing the filter will be written "+ - "into the output file. Filter condition is a logical combination (using parentheses for grouping, "+ - "& for AND, | for OR, = for specifying values) of the primitives listed below. Primitives that end with "+ - " 1 or 2 apply specifically to pair end 1 or 2, respectively; when, in addition to primitives

1 and

2, " + - " a primitive

is also defined, it is always interpreted as

1 &

2. Primitives: PAIRSONLY "+ - "(print only pairs=both ends mapped), MINQ1=, MINQ2=, MINQ= (minimum alignment quality if read is mapped) "+ - " for MINQ1= & MINQ2=), ERRDIFF1=, ERRDIFF2= " + - "(when next-best alignment(s) are available", - optional = true) - public String FILTER = null; - @Option(shortName="Q", optional=true, doc="Minimum mapping quality required on both ends in order to accept the pair.") - public Integer MINQ = -1; - - public static int INFINITY = 1000000000; - - // we will collect some stats along the way: - private int fragments_seen = 0; - private int end1_missing = 0; - private int end2_missing = 0; - private int end1_unmapped = 0; - private int end2_unmapped = 0; - private int end1_unpaired_unmapped = 0; - private int end2_unpaired_unmapped = 0; - private int both_unmapped = 0; - private int both_mapped = 0; - private int both_unique = 0; - private int proper_pair = 0; - private int proper_unique_pair = 0; - private int outer_pair = 0; - private int side_pair = 0; - private int inter_chrom = 0; - - /** Required main method implementation. 
*/ - public static void main(final String[] argv) { - System.exit(new PairMaker().instanceMain(argv)); - } - - protected int doWork() { - - SAMFileReader end1Reader = new SAMFileReader(IN1); - SAMFileReader end2Reader = new SAMFileReader(IN2); - - SAMFileHeader h = checkHeaders(end1Reader.getFileHeader(), end2Reader.getFileHeader() ); - - SAMFileWriter outWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(h, true, OUTPUT); - - ParallelSAMIterator pi = new ParallelSAMIterator(end1Reader, end2Reader); - - List end1 ; - List end2 ; - - SAMRecord r1 = null, r2 = null; // alignments selected for ends 1 and 2, respectively, out of (possibly) multiple alternative placements - - while ( pi.hasNext() ) { - - fragments_seen++; - - Pair< List, List > ends = pi.next(); - - end1 = ends.getFirst(); - end2 = ends.getSecond(); - - if ( end1.size() == 0 ) { - // nothing at all for end1, choose best from end2 and save - end1_missing++; - r1 = null; - r2 = selectBestSingleEnd(end2); - if ( AlignmentUtils.isReadUnmapped(r2) ) end2_unpaired_unmapped++; - setPairingInformation(r1,r2); - outWriter.addAlignment(r2); - continue; - } - if ( end2.size() == 0 ) { - // nothing at all for end2, choose best from end1 and save - end2_missing++; - r1 = selectBestSingleEnd(end1); - r2 = null; - if ( AlignmentUtils.isReadUnmapped(r1) ) end1_unpaired_unmapped++; - setPairingInformation(r1,r2); - outWriter.addAlignment(r1); - continue; - } - - // we got reads on both sides - - if ( end1.size() == 1 && end2.size() == 1 ) { - // unique alignments on both ends: not much to do, just save as a pair - r1 = end1.get(0); - r2 = end2.get(0); - if ( AlignmentUtils.isReadUnmapped(r1) ) { - end1_unmapped++; - if ( AlignmentUtils.isReadUnmapped(r2) ) { end2_unmapped++; both_unmapped++; } - } else { - if ( AlignmentUtils.isReadUnmapped(r2)) end2_unmapped++; - else { - // both mapped - both_mapped++; - - boolean unique = false; - if ( r1.getMappingQuality() > 0 && - r2.getMappingQuality() > 0 ) { unique = true; 
both_unique++; } - - if ( r1.getReferenceIndex() != r2.getReferenceIndex() ) { - inter_chrom++; - } else { - switch ( orientation(r1,r2) ) { - case INNER : proper_pair++; if ( unique ) proper_unique_pair++; break; - case OUTER: outer_pair++; break; - case LEFT: - case RIGHT: side_pair++; break; - default: - } - } - } - } - - setPairingInformation(r1,r2); - outWriter.addAlignment(r1); - outWriter.addAlignment(r2); - continue; - } - - if ( end1.size() == 1 && AlignmentUtils.isReadUnmapped(end1.get(0)) ) { - // special case: multiple alignments for end2 but end1 is unmapped: just select best for end2 - r1 = end1.get(0); - r2 = selectBestSingleEnd(end2); - end1_unmapped++; - if ( AlignmentUtils.isReadUnmapped(r2) ) { end2_unmapped++; both_unmapped++; } - setPairingInformation(r1,r2); - outWriter.addAlignment(r1); - outWriter.addAlignment(r2); - continue; - } - - if ( end2.size() == 1 && AlignmentUtils.isReadUnmapped(end2.get(0)) ) { - // special case: multiple alignments for end1 but end2 is unmapped: just select best for end1 - r1 = selectBestSingleEnd(end1); - r2 = end2.get(0); - end2_unmapped++; - if ( AlignmentUtils.isReadUnmapped(r1) ) { end1_unmapped++; both_unmapped++; } - setPairingInformation(r1,r2); - outWriter.addAlignment(r1); - outWriter.addAlignment(r2); - continue; - } - - // ok, if we are here then we got both ends mapped and multiple alignments in at least one end. 
- // Let's loop through candidates and choose the best pair: -/* - List good = new ArrayList(); - List bad = new ArrayList(); - double best_good = INFINITY; - - for ( SAMRecord candidate1 : end1 ) { - for ( SAMRecord candidate2 : end2 ) { - if ( candidate1.getReferenceIndex() == candidate2.getReferenceIndex() - && orientation(candidate1,candidate2)==PairOrientation.INNER ) { - double score = pairingScore(candidate1, candidate2); - } - } - } -*/ - Pair bestPair = selectBestPair(end1,end2); - -// r1 = selectUniqueSingleEnd(end1,MINQ.intValue()); -// r2 = selectUniqueSingleEnd(end2,MINQ.intValue()); - - r1 = bestPair.first; - r2 = bestPair.second; - - both_mapped++; - if ( r1.getMappingQuality() > 0 && r2.getMappingQuality() > 0 ) both_unique++; - if ( r1.getReferenceIndex() == r2.getReferenceIndex() && - orientation(r1,r2) == PairOrientation.INNER ) { - proper_pair++; - } - if ( r1.getMappingQuality() > 0 && r2.getMappingQuality() > 0 && - r1.getReferenceIndex() == r2.getReferenceIndex() && - orientation(r1,r2) == PairOrientation.INNER ) { - proper_unique_pair++; - } - setPairingInformation(r1,r2); - outWriter.addAlignment(r1); - outWriter.addAlignment(r2); - - - } - - pi.close(); - outWriter.close(); - - System.out.println(); - System.out.println("Total fragments (read pairs): "+fragments_seen); - - System.out.println("Unpaired end1 reads (end2 missing): "+end2_missing); - System.out.println("Unpaired end1 reads (end2 missing), unmapped: "+end1_unpaired_unmapped); - System.out.println("Unpaired end2 reads (end1 missing): "+end1_missing); - System.out.println("Unpaired end2 reads (end1 missing), unmapped: "+end2_unpaired_unmapped); - System.out.println("Pairs with end1 unmapped (regardless of end2 status): "+end1_unmapped); - System.out.println("Pairs with end2 unmapped (regardless of end1 status): "+end2_unmapped); - System.out.println("Pairs with both ends unmapped: "+both_unmapped); - System.out.println("Pairs with both ends mapped: "+both_mapped); - 
System.out.println("Pairs with both ends mapped uniquely (MQ>0): "+both_unique); - System.out.println("Pairs with both ends mapped properly: "+proper_pair); - System.out.println("Pairs with both ends mapped uniquely and properly: "+proper_unique_pair); - System.out.println(); - return 0; - } - - private Query > parseConditions(String filter) { - - filter = filter.trim(); - Query> result1, result2; - - int level = 0; // parentheses level - - for ( int i = 0 ; i < filter.length() ; i++ ) { - switch ( filter.charAt(i) ) { - case '(': level++; break; - case ')': level--; - if ( level < 0 ) throw new RuntimeException("Too many closing parentheses in the expression."); - break; - case '&': if ( level > 0 ) break; // parenthised expression - not now! - // we are at level 0: parse expressions to the left and to the right of '&' operator - return new CompositeQuery >(parseConditions(filter.substring(0,i)), - parseConditions(filter.substring(i+1)), - Query.Operator.AND); - case '|': if ( level > 0 ) break; // inside parenthised expression - keep scanning, we'll process the whole expression later - // we are at level 0: parse expressions to the left and to the right of '|' operator - return new CompositeQuery >(parseConditions(filter.substring(0,i)), - parseConditions(filter.substring(i+1)), - Query.Operator.OR); - default: break; - } - } - - if ( level > 0 ) throw new RuntimeException("Too many opening parentheses in the expression."); - - // if we ended up here, this is either a single parenthized expression or a primitive. 
- // filter was trimmed; if it is a parenthized expression, ( and ) should be first/last symbols: - if ( filter.charAt(0) == '(' && filter.charAt(filter.length()-1) == ')') - return parseConditions(filter.substring(1,filter.length()-1)); - - // ok, it's a primitive: - int equal_pos = filter.indexOf('='); - if ( equal_pos < 0 ) { // it's not a = expression, but a logical primitive - if ( "PAIRSONLY".equals(filter) ) return new BothEndsMappedQuery(); - } - - return null; - } - - /** - * Utility method: checks if the two headers are the same. Returns the first one if they are, - * a non-NULL one if the other one is NULL, or NULL if both headers are NULL. If headers are - * both not NULL and are not the same, a RuntimeException is thrown. - * @param h1 - * @param h2 - * @return true if the headers are the same - */ - private SAMFileHeader checkHeaders(SAMFileHeader h1, SAMFileHeader h2) { - - if ( h1 == null ) return h2; - if ( h2 == null ) return h1; - -// if ( ! h1.getReadGroups().equals(h2.getReadGroups())) throw new RuntimeException("Read groups in the two input files do not match"); -// if ( ! h1.getSequenceDictionary().equals(h2.getSequenceDictionary()) ) throw new RuntimeException("Sequence dictionaries in the two input files do not match"); -// if ( ! h1.getProgramRecords().equals(h2.getProgramRecords()) ) throw new RuntimeException("Program records in the two input files do not match"); - if ( ! h1.equals(h2) ) throw new RuntimeException("Headers in the two input files do not match"); - return h1; - } - - /** Given a list of alignments, returns the one with the best mapping quality score. - * If there is more than one alignment with the same score (got to be 0), one of - * these best-scoring alignments will be returned at random. 
- * @param l - * @return - */ - private SAMRecord selectBestSingleEnd(List l) { - if ( l.size() == 0 ) return null; // should not happen; just don not want to crash here, but somewhere else - if ( l.size() == 1 ) return l.get(0); // not much choice... - int best_qual = -1; - int n_unmapped = 0; - List best = new ArrayList(); - - for ( SAMRecord r : l ) { - if ( r.getReadUnmappedFlag() ) { - // paranoid; if there are ANY alignments available, there should not be any "unmapped" records among them; - // and if the read is "unmapped" indeed, then there should be no other alignments reported - n_unmapped++; - continue; - } - if ( r.getMappingQuality() > best_qual) { - best_qual = r.getMappingQuality(); - best.clear(); - best.add(r); - continue; - } - if ( r.getMappingQuality() == best_qual ) best.add(r); - } - if ( best.size() == 0 ) throw new RuntimeException("Currently Unsupported: SAM file might be not fully compliant. "+ - "Multiple 'unmapped' records found for read "+l.get(0).getReadName()); - if ( best.size() == 1 ) return best.get(0); - if ( best_qual != 0 ) throw new RuntimeException("Multiple alignments for the same read found with non-zero score. "+ - "Read: "+l.get(0).getReadName()+" best score: "+best_qual); - return best.get((int)(Math.random()*best.size())); - } - - private Pair selectBestPair(List end1, List end2) { - SAMRecord r1 = selectBestSingleEnd(end1); - SAMRecord r2 = selectBestSingleEnd(end2); - - if ( AlignmentUtils.isReadUnmapped(r1) || AlignmentUtils.isReadUnmapped(r2) ) { - throw new RuntimeException("Unmapped read in selectBestPair: should never happen. 
read1="+r1.toString()+"; read2="+r2.toString()); - } - - if ( r1.getMappingQuality() > 0 && r2.getMappingQuality() > 0 ) { - // we got best placements for the reads - return new Pair(r1,r2); - } - - // at least one alignment is non-unique - - List toChooseFrom = new ArrayList(); - - if ( r1.getMappingQuality() > 0 ) { - // r2 is non unique - for ( SAMRecord r : end2 ) { - if ( r.getReferenceIndex().intValue() == r1.getReferenceIndex().intValue() ) { - toChooseFrom.add(r); - } - } - if ( toChooseFrom.size() == 1 ) { - return new Pair(r1,toChooseFrom.get(0)); - } else { - if ( toChooseFrom.size() > 1 ) { - return new Pair(r1,toChooseFrom.get((int)(Math.random()*toChooseFrom.size()))); - } else { - return new Pair(r1,end2.get((int)(Math.random()*end2.size()))); - } - } - } - - if ( r2.getMappingQuality() > 0 ) { - // r1 is non unique - for ( SAMRecord r : end1 ) { - if ( r.getReferenceIndex().intValue() == r2.getReferenceIndex().intValue() ) { - toChooseFrom.add(r); - } - } - if ( toChooseFrom.size() == 1 ) { - return new Pair(toChooseFrom.get(0),r2); - } else { - if ( toChooseFrom.size() > 1 ) { - return new Pair(toChooseFrom.get((int)(Math.random()*toChooseFrom.size())),r2); - } else { - return new Pair(end2.get((int)(Math.random()*end2.size())),r2); - } - } - } - - // both are non-unique - List> toChooseFromP = new ArrayList>(); - for ( SAMRecord rr1 : end1 ) { - for ( SAMRecord rr2 : end2 ) { - if ( rr1.getReferenceIndex().intValue() == rr2.getReferenceIndex().intValue() ) { - toChooseFromP.add ( new Pair(rr1,rr2) ); - } - } - } - if ( toChooseFrom.size() == 1 ) { - return toChooseFromP.get(0); - } else { - if ( toChooseFrom.size() > 1 ) { - return toChooseFromP.get((int)(Math.random()*toChooseFromP.size())); - } else { - return new Pair(end1.get((int)(Math.random()*end1.size())), - end2.get((int)(Math.random()*end2.size()))); - } - } - - } - - private SAMRecord selectUniqueSingleEnd(List l, int minq) { - if ( l.size() == 0 ) return null; // should not happen; 
just don not want to crash here, but somewhere else - if ( l.size() == 1 ) { - if ( l.get(0).getMappingQuality() >= minq ) return l.get(0); - else return null; // not unique enough - } - - int n_unmapped = 0; - List best = new ArrayList(); - - for ( SAMRecord r : l ) { - if ( AlignmentUtils.isReadUnmapped(r) ) { - // paranoid; if there are ANY alignments available, there should not be any "unmapped" records among them; - // and if the read is "unmapped" indeed, then there should be no other alignments reported - n_unmapped++; - continue; - } - if ( r.getMappingQuality() >= minq ) { - best.add(r); - continue; - } - } - if ( best.size() == 0 ) return null; // no unique alignment - if ( best.size() > 1 ) { - for ( SAMRecord r : best ) { - System.out.println("READ "+r.getReadName()+" mapQ="+r.getMappingQuality()+" at="+r.getReferenceName()+ - ":"+r.getAlignmentStart()+"("+(r.getReadNegativeStrandFlag()?"-":"+")+") cig="+r.getCigarString()); - } - throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq); - } - - return best.get(0); - } - - /** - * Assumes that alignments r1 and r2 are the two ends of a selected mate pair, and sets pairing flags and reciprocal mate - * mapping values for each of them (so that e.g. r2.getMateAlignmentStart() is properly set to - * r1.getAlignmentStart(), etc). Any one of the two alignments can be null, in which case it is assumed - * that the corresponding end is unmapped, and the flags/values in the other end will be set accordingly. - * If both r1 and r2 are null, an exception will be thrown. 
- * @param r1 first end in a mate pair - * @param r2 second end in a mate pair - */ - private void setPairingInformation(SAMRecord r1, SAMRecord r2) { - // set mate information (note that r1 and r2 can not be 'null' simultaneously): - - if ( r1 == null && r2 == null ) throw new RuntimeException("Both ends of the mate pair are passed as 'null'"); - - // take care of unpaired reads: - if ( r2 == null ) { - r1.setReadPairedFlag(false); - r1.setMateReferenceIndex( SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ); - r1.setMateNegativeStrandFlag( false ); - r1.setMateAlignmentStart( SAMRecord.NO_ALIGNMENT_START ); - return; - } - - if ( r1 == null ) { - r2.setReadPairedFlag(false); - r2.setMateReferenceIndex( SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ); - r2.setMateNegativeStrandFlag( false ); - r2.setMateAlignmentStart( SAMRecord.NO_ALIGNMENT_START ); - return; - } - - // we got both reads - - r1.setReadPairedFlag(true); - r2.setReadPairedFlag(true); - - boolean r1unmapped = AlignmentUtils.isReadUnmapped(r1); - boolean r2unmapped = AlignmentUtils.isReadUnmapped(r2); - - r1.setMateUnmappedFlag( r2unmapped ); - r1.setMateReferenceIndex( r2unmapped ? SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX : r2.getReferenceIndex() ); - r1.setMateNegativeStrandFlag(r2unmapped ? false : r2.getReadNegativeStrandFlag() ); - r1.setMateAlignmentStart( r2unmapped ? SAMRecord.NO_ALIGNMENT_START : r2.getAlignmentStart()); - - r1.setFirstOfPairFlag(true); - r1.setSecondOfPairFlag(false); - - r2.setMateUnmappedFlag( r1unmapped ); - r2.setMateReferenceIndex( r1unmapped ? SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX : r1.getReferenceIndex() ); - r2.setMateNegativeStrandFlag(r1unmapped ? false : r1.getReadNegativeStrandFlag() ); - r2.setMateAlignmentStart( r1unmapped ? 
SAMRecord.NO_ALIGNMENT_START : r1.getAlignmentStart()); - r2.setFirstOfPairFlag(false); - r2.setSecondOfPairFlag(true); - } - - /** - * Returns fragment length inferred from the two alignments, or 1,000,000,000 if reads align - * on different chromosomes or are not mapped at all. NOTE: the returned fragment length is - * start+read length of the rightmost alignment minus start of the leftmost one; it makes - * sense only for "proper" pairs (leftmost forward, rightmost reverse); for all other pairs - * the returned number does not allow for any meaningful interpretation. - * @param r1 - * @param r2 - * @return - */ - private int fragmentSize(final SAMRecord r1, final SAMRecord r2) { - if ( r1 == null || AlignmentUtils.isReadUnmapped(r1) || - r2 == null || AlignmentUtils.isReadUnmapped(r2) || - r1.getReferenceIndex() != r2.getReferenceIndex() ) return INFINITY; - if ( r1.getAlignmentStart() <= r2.getAlignmentStart() ) - return ( r2.getAlignmentStart() + r2.getReadLength() - r1.getAlignmentStart()); - else return ( r1.getAlignmentStart() + r1.getReadLength() - r2.getAlignmentStart()); - } - - enum PairOrientation { - INNER, OUTER, LEFT, RIGHT, NONE - } - - /** - * Returns orientation of the pair: INNER for "--> <--", OUTER for "<-- -->", - * LEFT for "<-- <--" and RIGHT for "--> -->" (regardless of which read in a pair, 1 or 2, - * actually maps to the left and which to the right). If any of the reads is null or unmapped, returns NONE. - * If reads are on different contigs, they are still ordered according to the underlying contig order (by reference - * index), and the returned value reflects their relative orientation as described above (however it does not seem - * to be very useful in that case). 
- * @param r1 - * @param r2 - * @return - */ - private PairOrientation orientation(SAMRecord r1, SAMRecord r2) { - - if ( r1 == null || r2 == null || AlignmentUtils.isReadUnmapped(r1) || AlignmentUtils.isReadUnmapped(r2)) - return PairOrientation.NONE; - - SAMRecord left, right; - - if ( r1.getReferenceIndex() == r2.getReferenceIndex() ) { - if ( r1.getAlignmentStart() <= r2.getAlignmentStart() ) { - left = r1; - right = r2; - } else { - left = r2; - right = r1; - } - } else { - if ( r1.getReferenceIndex() < r2.getReferenceIndex() ) { - left = r1; - right = r2; - } else { - left = r2; - right = r1; - } - } - - if ( ! left.getReadNegativeStrandFlag() ) { // left is forward - if ( right.getReadNegativeStrandFlag() ) return PairOrientation.INNER; // left is forward, right is reverse - else return PairOrientation.RIGHT; // left is forward, right is forward - } else { // left is reverse - if ( right.getReadNegativeStrandFlag() ) return PairOrientation.LEFT; // left is reverse, right is reverse - else return PairOrientation.OUTER; // left is reverse, right is forward - } - } - - class Pairing { - SAMRecord r1; - SAMRecord r2; - double score; - - Pairing() { - this(null,null,INFINITY); - } - - Pairing(SAMRecord r1, SAMRecord r2) { - this(r1,r2,INFINITY); - } - Pairing(SAMRecord r1, SAMRecord r2, double score) { - this.r1 = r1; - this.r2 = r2; - this.score = score; - } - - SAMRecord getFirst() { return r1; } - SAMRecord getSecond() { return r2; } - double getScore() { return score; } - - void setFirst(SAMRecord r) { r1 = r; } - void setSecond(SAMRecord r) { r2 = r; } - void setScore(double score) { this.score = score; } - } - - private double pairingScore(final SAMRecord r1, final SAMRecord r2) { - - return ( ( r1.getMappingQuality() + r2.getMappingQuality() ) * Math.exp(1)) ; - } - - interface Query { - boolean isSatisfied(T record) ; - enum Operator { OR, AND }; - } - - class CompositeQuery implements Query { - private Query q1; - private Query q2; - private 
Query.Operator type ; // 1 for 'and', 0 for 'or' - - CompositeQuery(Query q1, Query q2, Query.Operator type) { - this.q1 = q1; - this.q2 = q2; - this.type = type; - } - - public boolean isSatisfied(T record) { - switch ( type ) { - case AND: return q1.isSatisfied(record) && q2.isSatisfied(record); - case OR: return q1.isSatisfied(record) || q2.isSatisfied(record); - default: throw new IllegalStateException("Unknown composite query operator"); - } - } - } - - class BothEndsMappedQuery implements Query< Pair > { - public boolean isSatisfied(Pair p) { - return ( ! AlignmentUtils.isReadUnmapped(p.getFirst()) && ! AlignmentUtils.isReadUnmapped(p.getSecond())) ; - } - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/tools/RemapAlignments.java b/java/src/org/broadinstitute/sting/playground/tools/RemapAlignments.java deleted file mode 100644 index f951b0c2e..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/RemapAlignments.java +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.tools; - -import java.io.File; -import java.util.Comparator; -import java.util.Set; -import java.util.TreeSet; -import java.util.Iterator; -import java.util.Map.Entry; - -import org.broadinstitute.sting.playground.utils.GenomicMap; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; - -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Option; -import net.sf.picard.cmdline.Usage; -import net.sf.picard.reference.ReferenceSequenceFileWalker; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileHeader.SortOrder; -import net.sf.samtools.SAMFileReader.ValidationStringency; - - -public class RemapAlignments extends CommandLineProgram { - - // Usage and parameters - @Usage(programVersion="1.0") public String USAGE = "Remaps custom-reference (e.g. transcriptome) alignments onto the genomic reference\n"; - @Option(shortName="M", - doc="Map file: from the reference the reads were aligned to, to the master reference the alignments should be remapped to. "+ - "In other words, for each custom-reference contig C this map must provide a (possibly disjoint) list of intervals "+ - "on the target reference, onto which C maps base-by-base. 
", - optional=false) - public File MAP_FILE = null; - @Option(shortName="I", - doc="Input file (bam or sam) with alignments to be remapped", - optional=false) - public File IN = null; - @Option(shortName="O", - doc="File to write remapped reads to.", - optional=false) - public File OUT = null; - @Option(shortName="R", - doc="Target reference to remap alignments onto.", - optional=false) - public File REFERENCE = null; - @Option( - doc="If a read has multiple alignments that are exactly the same after remapping, "+ - "then keep only one copy of such alignment in output file. Multiple alignments that are "+ - "not equivalent after remapping are not affected by this flag. "+ - "Multiple alignments for the same query must be grouped on adjacent lines of the input file to be detected "+ - "(i.e. input file must be sorted by read name), " + - "otherwise REDUCE will have no effect.", - optional=true) - public boolean REDUCE = false; - - - private GenomicMap map = null; - private String lastReadName = null; - private int totalReads = 0; - private int totalRecords = 0; - private int badRecords = 0; - private int totalUnmappedReads = 0; - private int writtenRecords = 0; - - private Set remappedReads = null; - private SAMFileWriter writer = null; - private SAMFileReader reader = null; - - private static int [] g_log_n; // copied from bwa - - - /** Required main method implementation. 
*/ - public static void main(final String[] argv) { - System.exit(new RemapAlignments().instanceMain(argv)); - } - - protected int doWork() { - - g_log_n = new int[256]; - for (int i = 1; i < 256; ++i) g_log_n[i] = (int)(4.343 * Math.log(i) + 0.5); - - reader = new SAMFileReader(IN); - reader.setValidationStringency(ValidationStringency.SILENT); - SAMFileHeader oldHeader = reader.getFileHeader(); - if ( oldHeader == null ) throw new RuntimeException("Failed to retrieve SAM file header from the input bam file"); - - if ( REDUCE && oldHeader.getSortOrder() != SortOrder.queryname ) - System.out.println("WARNING: Input file is not sorted by query name, REDUCE may have no effect. Sort order: " - +oldHeader.getSortOrder()); - - remappedReads = new TreeSet(new AlignmentComparator()); - - SAMFileHeader h = new SAMFileHeader(); - - for ( Entry attr : oldHeader.getAttributes() ) h.setAttribute(attr.getKey(), attr.getValue()); - h.setGroupOrder(oldHeader.getGroupOrder()); - h.setProgramRecords(oldHeader.getProgramRecords()); - h.setReadGroups(oldHeader.getReadGroups()); - - if ( oldHeader.getSortOrder() == SortOrder.queryname ) { - h.setSortOrder(SortOrder.queryname); - } else { - h.setSortOrder(SortOrder.unsorted); - } - - ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(REFERENCE); - - if ( reference.getSequenceDictionary() == null ) { - System.out.println("No reference sequence dictionary found. 
Aborting."); - reader.close(); - System.exit(1); - } - - h.setSequenceDictionary(reference.getSequenceDictionary()); - GenomeLocParser genomeLocParser = new GenomeLocParser(reference.getSequenceDictionary()); - - map = new GenomicMap(10000); - map.read(genomeLocParser,MAP_FILE); - System.out.println("Map loaded successfully: "+map.size()+" contigs"); - - - writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(h, true, OUT); - - for ( SAMRecord read : reader ) { - - - if ( map.remapToMasterReference(read,h,true) == null ) { - badRecords++; - continue; - } - if ( AlignmentUtils.isReadUnmapped(read) ) totalUnmappedReads++; - - // destroy mate pair mapping information, if any (we will need to reconstitute pairs after remapping both ends): - read.setMateReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); - read.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); -// if ( read.getReadPairedFlag() ) System.out.println("PAIRED READ!!"); - - totalRecords++; - - if ( totalRecords % 1000000 == 0 ) System.out.println(totalRecords + " valid records processed"); - - - if ( ! 
read.getReadName().equals(lastReadName) ) { - totalReads++; - lastReadName = read.getReadName(); - - - if ( REDUCE ) { - - updateCountsAndQuals(remappedReads); - - for ( SAMRecord r : remappedReads ) { - writer.addAlignment(r); // emit non-redundant alignments for previous query - writtenRecords++; - } - remappedReads.clear(); - } - } - if ( REDUCE ) remappedReads.add(read); - else { - writer.addAlignment(read); - writtenRecords++; - } - } - - // write remaining bunch of reads: - if ( REDUCE ) { - updateCountsAndQuals(remappedReads); - for ( SAMRecord r : remappedReads ) { - writer.addAlignment(r); // emit non-redundant alignments for previous query - writtenRecords++; - } - } - - System.out.println("Total valid records processed: "+totalRecords); - System.out.println("Incorrect records (alignments across contig boundary) detected: "+badRecords + - " (discarded and excluded from any other stats)"); - System.out.println("Total reads processed: "+totalReads); - System.out.println("Total mapped reads: "+(totalReads-totalUnmappedReads)); - System.out.println("Average hits per mapped read: "+((double)(totalRecords-totalUnmappedReads))/(totalReads-totalUnmappedReads)); - System.out.println("Records written: "+writtenRecords); - System.out.println("Average hits per mapped read written (after reduction): " - +((double)(writtenRecords-totalUnmappedReads))/(totalReads-totalUnmappedReads)); - reader.close(); - writer.close(); - return 0; - } - - class AlignmentComparator implements Comparator { - - public int compare(SAMRecord r1, SAMRecord r2) { - if ( r1.getReferenceIndex() < r2.getReferenceIndex() ) return -1; - if ( r1.getReferenceIndex() > r2.getReferenceIndex() ) return 1; - if ( r1.getAlignmentStart() < r2.getAlignmentStart() ) return -1; - if ( r1.getAlignmentStart() > r2.getAlignmentStart() ) return 1; - return r1.getCigarString().compareTo(r2.getCigarString()); - } - - } - - private void updateCountsAndQuals(Set reads) { - if ( reads.size() == 1 ) { - SAMRecord r = 
reads.iterator().next(); - - // technically, if edit distance of the read is equal to max_diff used in alignments, - // we should have set 25... - if ( AlignmentUtils.isReadUnmapped(r) ) { - r.setMappingQuality(0); - } else { - r.setMappingQuality(37); - r.setAttribute("X0", new Integer(1)); - r.setAttribute("X1", new Integer(0)); - } - r.setNotPrimaryAlignmentFlag(false); - - } else { - - // we have multiple alignments for the read - // need to figure out how many best vs inferior alignments are there: - int minNM = 1000000; - int cnt = 0; // count of best alignments - Iterator it = reads.iterator(); - int n = reads.size(); // total number of (alternative) alignments for the given read. - boolean canComputeMapQ = true; - while ( it.hasNext() ) { - SAMRecord r = it.next(); - if ( AlignmentUtils.isReadUnmapped(r) && n > 1) { - // we do not want to keep unmapped records in the set unless it's the last and only record! - it.remove(); - n--; // one less alignment left in the current group of alignments - continue; - } - if ( ! canComputeMapQ ) continue; // some reads were missing NM attribute, so do not bother - we can not compute MapQ - Object attr = r.getAttribute("NM"); - if ( attr == null ) { - canComputeMapQ = false; // can not recompute qualities! 
- continue; - } else { - int nm; - if ( attr instanceof Short ) nm = ((Short)attr).intValue(); - else if ( attr instanceof Integer ) nm = ((Integer)attr).intValue(); - else throw new RuntimeException("NM attribute is neither Short nor Integer, don't know what to do."); - if ( nm < minNM ) { - minNM = nm; - cnt = 1; - } else if ( nm == minNM ) cnt++; - } - } - - if ( n == 1 ) { - SAMRecord r = reads.iterator().next() ; - if (AlignmentUtils.isReadUnmapped(r) ) { - // special case: we are left with a single unmapped alignment - r.setAttribute("X0", new Integer(0)); - r.setAttribute("X1", new Integer(0)); - return; - } - } - - // now reset counts of available alignments and mapping quals (if we can) in every alignment record: - for ( SAMRecord r : reads ) { - - int cnt2 = reads.size() - cnt; // count of inferior alignments - - r.setAttribute("X0", new Integer(cnt)); - r.setAttribute("X1", new Integer(cnt2)); - - if ( ! canComputeMapQ ) continue; // not all reads had NM field, so we can not recompute MapQ - - if ( cnt2 > 255 ) cnt2 = 255; // otherwise we will be out of bounds in g_log_n - - int nm_attr; - Object attr = r.getAttribute("NM"); - if ( attr instanceof Short ) nm_attr = ((Short)attr).intValue(); - else if ( attr instanceof Integer ) nm_attr = ((Integer)attr).intValue(); - else throw new RuntimeException("NM attribute is neither Short nor Integer, don't know what to do."); - if ( nm_attr == minNM ) { - - // one of the best alignments: - - r.setNotPrimaryAlignmentFlag(false); - if ( cnt == 1 ) { - // single best alignment; additional inferior alignments will only affect mapping qual - r.setMappingQuality( 23 < g_log_n[cnt2] ? 0 : 23 - g_log_n[cnt2] ); // this recipe for Q is copied from bwa - } else { - r.setMappingQuality(0); // multiple best alignments - mapping quality is 0 - } - } else { - - // secondary alignment ( we know we hold a better one) - r.setNotPrimaryAlignmentFlag(true); - r.setMappingQuality(0); // ??? should we set 0 for secondary?? 
- } - } - } - - } - -/* - private int bwa_approx_mapQ(SAMRecord r, int max_diff) { - int c1 = (Integer)r.getAttribute("X0"); - int c2 = (Integer)r.getAttribute("X1"); - int mm = (Integer)r.getAttribute("NM"); - if ( c1 > 0 ) return 0; - if ( c1 == 0 ) return 23; - if ( mm == max_diff ) return 25; - return 0; - } -*/ -} - - diff --git a/java/src/org/broadinstitute/sting/playground/tools/SliceBams.java b/java/src/org/broadinstitute/sting/playground/tools/SliceBams.java deleted file mode 100644 index c6cba41c3..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/SliceBams.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -package org.broadinstitute.sting.playground.tools; - -import net.sf.picard.PicardException; -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Option; -import net.sf.picard.cmdline.StandardOptionDefinitions; -import net.sf.picard.cmdline.Usage; -import net.sf.picard.io.IoUtil; -import net.sf.picard.sam.MergingSamRecordIterator; -import net.sf.picard.sam.SamFileHeaderMerger; -import net.sf.picard.util.Log; -import net.sf.samtools.*; -import net.sf.samtools.util.BlockCompressedOutputStream; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.sam.SimplifyingSAMFileWriter; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.TimeUnit; - -/** - * Reads a list of BAM files and slices all of them into a single merged BAM file - * containing reads in overlapping chr:start-stop interval. - * - * @author Mark DePristo - */ -public class SliceBams extends CommandLineProgram { - private static final Log log = Log.getInstance(SliceBams.class); - - // Usage and parameters - @Usage - public String USAGE = "Merges multiple SAM/BAM files into one BAM overlapping chr:start-stop interval .\n"; - - @Option(shortName="I", doc="List of input BAM files") - public File INPUT_LIST; - - @Option(shortName="O", doc="SAM or BAM file to write merged result to") - public File OUTPUT; - - @Option(shortName="L", doc="Location to include") - public String SLICE; - - private static final int PROGRESS_INTERVAL = 1000000; - - /** Required main method implementation. 
*/ - public static void main(final String[] argv) { - System.exit(new SliceBams().instanceMain(argv)); - } - - private List parseInputFiles(File list) { - try { - final List files = new ArrayList(); - for (String fileName : new XReadLines(list).readLines() ) { - files.add(new File(fileName)); - } - return files; - } catch ( FileNotFoundException e ) { - throw new PicardException("Couldn't read input list", e); - } - } - - /** - * Walk over the input files, reading the headers, and finally prepare the output - * BAM containing a merge of all of the headers. - * - * @param inputBAMs - * @return - */ - private SAMFileWriter createOutputBAM(List inputBAMs) { - Collection headers = new ArrayList(); - - log.info("Reading headers"); - int fileCounter = 1; - for (final File inFile : inputBAMs) { - IoUtil.assertFileIsReadable(inFile); - final SAMFileReader inReader = new SAMFileReader(inFile, null); // null because we don't want it to look for the index - final SAMFileHeader inHeader = inReader.getFileHeader(); - log.info(" Reading header from file " + inFile + " " + fileCounter++ + " of " + inputBAMs.size()); - headers.add(inHeader); - inReader.close(); - } - - final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); - SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(headerMerger.getMergedHeader(), false, OUTPUT); - return new SimplifyingSAMFileWriter(out); - } - - /** Combines multiple SAM/BAM files into one. 
*/ - @Override - protected int doWork() { - SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT); - SAMFileWriterFactory.setDefaultCreateIndexWhileWriting(true); - - // Open the files for reading and writing - List inputBAMs = parseInputFiles(INPUT_LIST); - IoUtil.assertFileIsWritable(OUTPUT); - final SAMFileWriter out = createOutputBAM(inputBAMs); - GenomeLocParser glParser = new GenomeLocParser(out.getFileHeader().getSequenceDictionary()); - GenomeLoc loc = glParser.parseGenomeLoc(SLICE); - - log.info("Reading BAM records"); - long numRecords = 1; - int fileCounter = 1; - for (final File inFile : inputBAMs) { - IoUtil.assertFileIsReadable(inFile); - log.info(" Reading file " + inFile + " " + fileCounter++ + " of " + inputBAMs.size()); - final SAMFileReader reader = new SAMFileReader(inFile); - SAMRecordIterator iterator = reader.queryOverlapping(loc.getContig(), loc.getStart(), loc.getStop()); - - while ( iterator.hasNext() ) { - final SAMRecord record = iterator.next(); - out.addAlignment(record); - if (numRecords % PROGRESS_INTERVAL == 0) { - log.info(numRecords + " records read."); - } - } - - reader.close(); - } - - log.info("Finished reading inputs."); - log.info("Sorting final output file."); - out.close(); - return 0; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/tools/SortROD.java b/java/src/org/broadinstitute/sting/playground/tools/SortROD.java deleted file mode 100644 index 0a659c010..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/SortROD.java +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to 
permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.tools; - -import org.apache.log4j.BasicConfigurator; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.utils.codecs.completegenomics.CGVarCodec; -import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.utils.codecs.soapsnp.SoapSNPCodec; -import org.broad.tribble.gelitext.GeliTextCodec; -import org.broad.tribble.dbsnp.DbSNPCodec; -import org.broad.tribble.bed.BEDCodec; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.playground.gatk.features.maf.MafCodec; - -import java.io.*; -import java.util.*; - -import net.sf.samtools.util.SortingCollection; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFileFactory; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Jan 28, 2011 - * Time: 12:15:03 PM - * To change this template use File | Settings | File Templates. 
- */ -public class SortROD { - // setup the logging system, used by some codecs - private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); - - /** - * this class: - * 1) checks to see that the feature file exists - * 2) loads an index from disk, if one doesn't exist, it creates it and writes it to disk - * 3) creates a FeatureSource - * 4) iterates over the records, emitting a final tally for the number of features seen - * - * @param args a single parameter, the file name to load - */ - public static void main(String[] args) { - BasicConfigurator.configure(); - logger.setLevel(org.apache.log4j.Level.INFO); - // check yourself before you wreck yourself - we require one arg, the input file - if (args.length != 3 ) - printUsage(); - - - String refarg = args[0]; - if ( ! refarg.endsWith(".fasta")) { - System.err.println("Reference file name must end with .fasta"); - System.exit(1); - } - - File refFile = new File(refarg); - if ( ! refFile.exists() ) { - System.err.println("Reference file "+refarg+" does not exist"); - System.exit(1); - } - - String rodType = null; - String inputArg; - // our feature file - int pos = args[1].indexOf(":"); - if ( pos == -1 ) { - inputArg = args[1]; - } else { - rodType = args[1].substring(0,pos); - inputArg = args[1].substring(pos+1); - } - File featureFile = new File(inputArg); - if (!featureFile.exists()) { - System.err.println("File " + featureFile.getAbsolutePath() + " doesnt' exist"); - printUsage(); - } - - BufferedWriter out = null; - try { - out = new BufferedWriter(new FileWriter(args[2])); - } catch ( IOException e ) { - System.err.println("Can not open output file "+args[2]+" for writing"); - System.exit(1); - } - - // determine the codec - FeatureCodec featureCodec = getFeatureCodec(featureFile,rodType); - ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); - - AsciiLineReader reader = null; - try { - reader = new AsciiLineReader(new 
FileInputStream(featureFile)); - } catch (FileNotFoundException e) { - System.err.println("File "+featureFile.getAbsolutePath()+" doesn't exist"); - System.exit(1); - } - - // read the headers - featureCodec.readHeader(reader); - - GenomeLocParser parser = new GenomeLocParser(ref.getSequenceDictionary()); - - SortingCollection sorter = SortingCollection.newInstance(String.class, - new LineCodec(), - new FeatureComparator(featureCodec,parser),200000); - - int nLines = 0; - try { - String currentLine = reader.readLine(); - while ( currentLine != null ) { - nLines++; - - // uncomment if null returns should be ignored - //if ( featureCodec.decodeLoc(currentLine) != null ) - sorter.add(currentLine); - - currentLine = reader.readLine(); - } - - for ( String s : sorter ) { - out.write(s); - out.write('\n'); - } - out.close(); - } catch (IOException e) { - System.err.println("Writing failed to the output file "+args[2]); - System.exit(1); - } - - logger.info("Sorting finished. Processed lines: "+nLines); - // runWithIndex(featureFile, codec, optimizeIndex); - - } - - - /** - * print usage information - */ - public static void printUsage() { - System.err.println("Usage: java -jar SortROD.jar [:] "); - System.err.println(" Where input can be of type: VCF (ends in .vcf or .VCF)"); - System.err.println(" Bed (ends in .bed or .bed)"); - System.err.println(" DbSNP (ends in .snp or .rod)"); - System.err.println(" MAF (ends in .maf)"); - System.err.println(" If input file has non-standard extension, rodType can be specified"); - System.err.println(" (rodType always takes precedence over file extension, even if the"); - System.err.println(" latter is otherwise recognizable). rodType can be vcf, bed, dbsnp, or maf"); - System.err.println(" Reference is what the input file needs to be sorted against"); - - /** - * you could add others here; also look in the GATK code-base for an example of a dynamic way - * to load Tribble codecs. 
- */ - System.exit(1); - } - - - public static FeatureCodec getFeatureCodec(File featureFile, String rodType) { - // quickly determine the codec type - if ( rodType != null ) { - if (rodType.equals("vcf") ) return new VCFCodec(); - if (rodType.equals("bed") ) return new BEDCodec(); - if (rodType.equals("cgvar") || rodType.equals("CGVar") ) return new CGVarCodec(); - if (rodType.equals("snp") || rodType.equals("dbsnp") ) return new DbSNPCodec(); - if (rodType.equals("geli.calls") || rodType.equals("geli") ) return new GeliTextCodec(); - if (rodType.equals("txt") ) return new SoapSNPCodec(); - if (rodType.equals("maf") ) return new MafCodec(); - throw new StingException("Explicitly specified rod type "+rodType+" is not recognized"); - } - if ( featureFile.getName().endsWith(".vcf") || featureFile.getName().endsWith(".VCF") ) - return new VCFCodec(); - if (featureFile.getName().endsWith(".bed") || featureFile.getName().endsWith(".BED") ) - return new BEDCodec(); - if ( featureFile.getName().endsWith(".tsv") || featureFile.getName().endsWith(".TSV") ) - return new CGVarCodec(); - if (featureFile.getName().endsWith(".snp") || featureFile.getName().endsWith(".rod") ) - return new DbSNPCodec(); - if (featureFile.getName().endsWith(".geli.calls") || featureFile.getName().endsWith(".geli") ) - return new GeliTextCodec(); - if (featureFile.getName().endsWith(".txt") || featureFile.getName().endsWith(".TXT") ) - return new SoapSNPCodec(); - if (featureFile.getName().endsWith(".maf") || featureFile.getName().endsWith(".MAF") ) - return new MafCodec(); - throw new IllegalArgumentException("Unable to determine correct file type based on the file name, for file -> " + featureFile); - } - - static class LineCodec implements SortingCollection.Codec { - OutputStream os; - InputStream is; - - public void setOutputStream(OutputStream outputStream) { - os = outputStream; - } - - public void setInputStream(InputStream inputStream) { - is = inputStream; - } - - public void encode(String 
s) { - try { - os.write(s.getBytes()); - os.write('\n'); - } catch (IOException e) { - throw new StingException("SortingCollection: Write into temporary file failed",e); - } - } - - public String decode() { - List l = new ArrayList(1024); - try { - int c = is.read(); - while ( c != -1 && c != '\n' ) { - l.add((byte)c); - c = is.read(); - } - } catch (IOException e) { - throw new StingException("SortingCollection: Read from temporary file failed",e); - } - return new String(toByteArray(l)); //To change body of implemented methods use File | Settings | File Templates. - } - - public SortingCollection.Codec clone() { - LineCodec codec = new LineCodec(); - codec.setInputStream(is); - codec.setOutputStream(os); - return codec; //To change body of implemented methods use File | Settings | File Templates. - } - - private byte [] toByteArray(List l) { - byte[] ret = new byte[l.size()]; - for ( int i = 0 ; i < l.size() ; i++ ) ret[i] = l.get(i); - return ret; - } - } - - static class FeatureComparator implements Comparator { - - FeatureCodec codec ; - GenomeLocParser parser; - - public FeatureComparator (FeatureCodec codec, GenomeLocParser parser) { - this.codec = codec; - this.parser=parser; - } - - /** - * Compares its two arguments for order. Returns a negative integer, - * zero, or a positive integer as the first argument is less than, equal - * to, or greater than the second.

- *

- * In the foregoing description, the notation - * sgn(expression) designates the mathematical - * signum function, which is defined to return one of -1, - * 0, or 1 according to whether the value of - * expression is negative, zero or positive.

- *

- * The implementor must ensure that sgn(compare(x, y)) == - * -sgn(compare(y, x)) for all x and y. (This - * implies that compare(x, y) must throw an exception if and only - * if compare(y, x) throws an exception.)

- *

- * The implementor must also ensure that the relation is transitive: - * ((compare(x, y)>0) && (compare(y, z)>0)) implies - * compare(x, z)>0.

- *

- * Finally, the implementor must ensure that compare(x, y)==0 - * implies that sgn(compare(x, z))==sgn(compare(y, z)) for all - * z.

- *

- * It is generally the case, but not strictly required that - * (compare(x, y)==0) == (x.equals(y)). Generally speaking, - * any comparator that violates this condition should clearly indicate - * this fact. The recommended language is "Note: this comparator - * imposes orderings that are inconsistent with equals." - * - * @param o1 the first object to be compared. - * @param o2 the second object to be compared. - * @return a negative integer, zero, or a positive integer as the - * first argument is less than, equal to, or greater than the - * second. - * @throws ClassCastException if the arguments' types prevent them from - * being compared by this comparator. - */ - public int compare(String o1, String o2) { - Feature f1 = codec.decodeLoc(o1); - Feature f2 = codec.decodeLoc(o2); - if ( f1 == null ) { - if ( f2 == null ) return 0; - else return -1; // null is less than non-null, this will hopefully push header strings up (but commented out lines will move up too!) - } - // f1 is not null - if ( f2 == null ) return 1; - - GenomeLoc l1 = parser.createGenomeLoc(f1.getChr(),f1.getStart(),f1.getEnd()); - GenomeLoc l2 = parser.createGenomeLoc(f2.getChr(),f2.getStart(),f2.getEnd()); - return l1.compareTo(l2); //To change body of implemented methods use File | Settings | File Templates. 
- } - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/tools/SplitReads.java b/java/src/org/broadinstitute/sting/playground/tools/SplitReads.java deleted file mode 100644 index 37cf040b8..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/SplitReads.java +++ /dev/null @@ -1,203 +0,0 @@ -package org.broadinstitute.sting.playground.tools; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.io.File; - -import net.sf.samtools.*; - -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Usage; -import net.sf.picard.cmdline.Option; - -public class SplitReads extends CommandLineProgram { - @Usage(programVersion="1.0") public String USAGE = "Splits reads: extracts sub-sequences of the specified length(s) from left "+ - "and/or right ends of all the reads into the specified output bam file(s). For the reads in the input that are mapped, "+ - "the subsequences in the output bam(s) will have appropriately adjusted alignment positions and chopped cigars."; - @Option(shortName="I", - doc="Input file (bam or sam) with read sequences to split.", - optional=false) - public File IN = null; - @Option(shortName="E", doc="Read end to select, 1=left, 2=right; default: select both ends.", - optional=true) public List READ_ENDS = new ArrayList(); - @Option(shortName="N", doc="Number of bases to keep in the corresponding segment of the read. "+ - "Synchronized with READ_ENDS argument; if single number is given, all selected segments (ends) will have specified length.", - optional=false) public List LENGTH = new ArrayList(); - @Option(shortName="S", doc="Read name for each segment (read end) will be set as original read name followed by the corresponding suffix." + - "Synchronized with READ_ENDS argument and must have the same number of entries if specified (note that default READ_ENDS is a list of (1,2). 
"+ - "By default, suffixes are empty strings, i.e. all segments have the same name(s) as the original read." , optional=true) public List SUFFIXES = new ArrayList(); - @Option(shortName="O",optional=false, doc="Each read end will be sent into the corresponding file " + - "(synchronized with READ_ENDS). If only one file name is specified, all read segments will be printed into that file." - ) public List OUTPUT_BAMS = new ArrayList(); - @Option(shortName="U", doc="Split and output only unmapped reads; mapped reads will be ignored.", - optional=true) public boolean UNMAPPED = false; - - - /** Required main method implementation. */ - public static void main(final String[] argv) { - System.exit(new SplitReads().instanceMain(argv)); - } - - protected int doWork() { - - // if read ends are not specified explicitly on the cmd line, set default 1,2 (both ends) - if ( READ_ENDS.size() == 0 ) { - READ_ENDS.add(1); - READ_ENDS.add(2); - } - - for ( Integer i : READ_ENDS) { - if ( ! i.equals(1) && ! 
i.equals(2)) throw new RuntimeException("Unknown value specified for READ_ENDS: "+i); - } - - // if suffixes are not specified, set them to "", "" - if ( SUFFIXES.size() == 0 ) { - for ( Integer i : READ_ENDS) { - SUFFIXES.add( "" ); - } - } else { - // or make sure that the number of suffixes matches the number of ends - if ( SUFFIXES.size() != READ_ENDS.size() ) throw new RuntimeException("Number of suffixes specified must be equal to the number of read ends requested."+ - "Passed: "+ READ_ENDS.size() +" READ_ENDS and " + SUFFIXES.size() + " SUFFIXES arguments."); - } - - if ( LENGTH.size() == 1 ) { - // if only one length is specified, apply it to all ends: - LENGTH = Collections.nCopies(READ_ENDS.size(), LENGTH.get(0)); - } - - if ( LENGTH.size() != READ_ENDS.size() ) throw new RuntimeException("Number of lengths specified must be equal to the number of read ends requested."+ - "Passed: "+ READ_ENDS.size() +" READ_ENDS and " + LENGTH.size() + " LENGTH arguments."); - - if ( READ_ENDS.size() != OUTPUT_BAMS.size() && OUTPUT_BAMS.size() != 1 ) - throw new RuntimeException("Number of output files must be either one, or equal to the number of read ends requested."+ - "Passed: "+ READ_ENDS.size() +" READ_ENDS and " + OUTPUT_BAMS.size() + " OUTPUT_BAMS arguments."); - - SAMFileReader inReader = new SAMFileReader(IN); - - List outWriters = new ArrayList(OUTPUT_BAMS.size()); - for ( File outName : OUTPUT_BAMS ) { - outWriters.add(new SAMFileWriterFactory().makeSAMOrBAMWriter(inReader.getFileHeader(), true, outName)) ; - } - - - for ( SAMRecord read : inReader ) { - - if ( UNMAPPED && ! 
read.getReadUnmappedFlag() ) continue; - - for ( int i = 0 ; i < READ_ENDS.size(); i++ ) { - - SAMRecord newRecord = null; - try { - newRecord = (SAMRecord)read.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException("Clone not supported by SAMRecord implementation"); - } - - final int whichEnd = READ_ENDS.get(i); - final int length = LENGTH.get(i); - String name = read.getReadName(); - if ( length > read.getReadLength() ) throw new RuntimeException("Read "+name+" is shorter than the specified length ("+read.getReadLength()+"<"+length+")"); - int start = 0 , stop = 0; // [start, stop) : segment of the read to be selected; coordinates are wrt read sequence; half-open 0 based - switch ( whichEnd ) { - case 1: start = 0 ; stop = start + LENGTH.get(i); break; - case 2: stop = read.getReadLength() ; start = stop - LENGTH.get(i); break; - } - - newRecord.setReadBases(Arrays.copyOfRange(read.getReadBases(),start,stop)); - newRecord.setBaseQualities(Arrays.copyOfRange(read.getBaseQualities(), start, stop)); - newRecord.setReadName(name+ SUFFIXES.get(i)); - if ( read.getReadUnmappedFlag() ) { - //newRecord.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); - } else { - newRecord.setAlignmentStart(read.getAlignmentStart()+start); - newRecord.setCigar( chopCigar(read.getCigar(), start, length )); - } - - if ( outWriters.size() > 1 ) outWriters.get(i).addAlignment(newRecord); - else outWriters.get(0).addAlignment(newRecord); - } - - } - - inReader.close(); - for ( SAMFileWriter w : outWriters ) w.close(); - - return 0; - } - - - /** - * Returns new cigar representing segment of the alignment that starts at position start (0-based) - * with respect to the start of the original cigar and covers length bases on the original read the - * origCigar corresponds to (i.e. I elements count, but D do not). 
- * @param origCigar - * @param start - * @param length - * @return - */ - private Cigar chopCigar( Cigar origCigar, int start, int length ) { - - int elementEnd = 0; // next base after the end of the current cigar element on the read - - Cigar newCigar = new Cigar(); - - Iterator elements = origCigar.getCigarElements().iterator(); - - if ( ! elements.hasNext() ) System.out.println("CIGAR HAS NO ELEMENTS!"); - - CigarElement ce = null; - - while ( elementEnd <= start ) { // if we did not reach the start of selected segment yet: -// System.out.println("INIT: start="+start+"; length="+length+"; elementEnd="+elementEnd); - ce = elements.next(); - switch ( ce.getOperator() ) { - case N: // - case D : // read misses bases wrt the ref, nothing to count on the read - break; - case I: - case M: - case S: - case H: // all these elements are real bases on the read. Skip them completely if - // 'start' is past them, or crop if it is inside: - elementEnd += ce.getLength(); // 1 base past end of the current element on the read - - } - } - // at this point we are guaranteed that ce is the element that contains 'start' position; - // now we start adding cigar elements: - - // add manually first element, since we need only a part of it after 'start': - newCigar.add( new CigarElement(Math.min(elementEnd-start, length), ce.getOperator()) ); - - int selectionEnd = start + length; -// System.out.println(origCigar.toString()+": start="+start+"; length="+length+"; selectionEnd="+selectionEnd+"; elementEnd="+elementEnd); - while ( elementEnd < selectionEnd ) { - ce = elements.next(); - switch ( ce.getOperator() ) { - case N: // - case D : // read misses bases wrt the ref, nothing to count on the read, but the element has to be added: - newCigar.add( new CigarElement(ce.getLength(), ce.getOperator()) ); - break; - case I: - case M: - case S: - case H: // all these elements are real bases on the read. 
Add them and count them - // making sure that the last element gets cropped if needed: - elementEnd += ce.getLength(); // 1 base past end of the current element on the read - if ( elementEnd > selectionEnd ) { // this is the last element we have to consider and it needs to be cropped: - newCigar.add( new CigarElement(ce.getLength() - elementEnd + selectionEnd , ce.getOperator()) ); - } else { - newCigar.add( new CigarElement(ce.getLength(), ce.getOperator()) ); - } - } - - } - return newCigar; - - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/tools/VcfToGeliText.java b/java/src/org/broadinstitute/sting/playground/tools/VcfToGeliText.java deleted file mode 100755 index c9120e6b4..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/VcfToGeliText.java +++ /dev/null @@ -1,93 +0,0 @@ -package org.broadinstitute.sting.playground.tools; - -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Usage; -import net.sf.picard.cmdline.Option; - -import java.io.*; -import java.util.*; - -public class VcfToGeliText extends CommandLineProgram { - @Usage(programVersion="1.0") public String USAGE = "Converts VCF files to simple Geli text files."; - @Option(shortName="I", doc="Input file (vcf) to convert.", optional=false) public File IN = null; - @Option(shortName="O",doc="Output file (gelitext). If not specified, output is printed to stdout.", optional=true) public File OUT = null; - @Option(shortName="sample",doc="Sample number to extract from the VCF. 
If not specified, it takes the firt one.", optional=true) public Integer sample = 1; - - public static void main(final String[] argv) { - System.exit(new VcfToGeliText().instanceMain(argv)); - } - - protected int doWork() { - - if ( IN == null ) - throw new RuntimeException("No input VCF file provided!"); - - FileReader in; - BufferedReader vcf; - try { - in = new FileReader(IN); - vcf = new BufferedReader(in); - } catch ( FileNotFoundException ie ) { - System.out.println("Failed to open input file "+IN+": "+ie.getCause()); - return 1; - } - - PrintStream out; - if ( OUT == null ) out = System.out; - else { - try { - out = new PrintStream(OUT); - } catch ( FileNotFoundException ie ) { - System.out.println("Failed to open output file "+OUT+": "+ie.getCause()); - return 1; - } - } - - String currentline; - try { - while ( (currentline = vcf.readLine()) != null ) { - if ( currentline.length() == 0 || currentline.charAt(0) == '#' ) - continue; - - StringTokenizer st = new StringTokenizer(currentline); - String chr = st.nextToken(); - String pos = st.nextToken(); - st.nextToken(); - String ref = st.nextToken(); - String altStr = st.nextToken(); - for (int i = 0; i < 4; i++) - st.nextToken(); - for (int i = 1; i < sample; i++) - st.nextToken(); - String sampleStr = st.nextToken(); - - HashMap alleles = new HashMap(); - alleles.put("0", ref); - StringTokenizer stAlt = new StringTokenizer(altStr, ","); - int index = 1; - while ( stAlt.hasMoreTokens() ) - alleles.put(String.valueOf(index++), stAlt.nextToken()); - - StringTokenizer stSample = new StringTokenizer(sampleStr, "/:"); - String genotype1 = stSample.nextToken(); - String genotype2 = stSample.nextToken(); - if ( genotype1.equals("0") && genotype2.equals("0") ) - continue; - - out.println(chr + "\t" + pos + "\t" + ref + "\t0\t0\t" - + alleles.get(genotype1) + alleles.get(genotype2) - + "\t30\t30\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0"); - } - vcf.close(); - in.close(); - out.close(); - } catch (IOException e) { - 
System.out.println("Jaa I/O exception"+e.getCause()); - return 1; - } - - return 0; - } - - -} diff --git a/java/src/org/broadinstitute/sting/playground/utils/GenomicMap.java b/java/src/org/broadinstitute/sting/playground/utils/GenomicMap.java deleted file mode 100644 index 9c771f696..000000000 --- a/java/src/org/broadinstitute/sting/playground/utils/GenomicMap.java +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.utils; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import net.sf.samtools.*; - -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.collections.Pair; - -public class GenomicMap implements Iterable > >{ - - private Map > map; - - /** Creates new empty genomic map preallocated to handle initialContig contigs. - * - * @param initialContigs - */ - public GenomicMap(int initialContigs) { - map = new HashMap >(initialContigs); - } - - /** Creates new empty genomic map */ - public GenomicMap() { - this(1000); - } - - /** Adds custom contig to the map, as a collection of intervals on the master reference. - * - * @param name name of the custom contig; can not be null - * @param c mapping of the custom contig sequence onto intervals on the master reference - */ - public void addCustomContig(String name, Collection c) { - if ( name == null ) throw new ReviewedStingException("Custom contig name can not be null"); - if ( map.containsKey(name)) throw new ReviewedStingException("Custom contig "+name+" already exists"); - map.put(name, c); - } - - /** Returns mapping of the specified custom contig onto the custom reference. \ - * If no such contig exists, returns null. 
- * - * @param name - * @return - */ - public Collection getContigMapping(String name) { return map.get(name); } - - /** Read genomic map from specified Arachne multimap file. Format: - * contig_id start stop contig_id start stop ... # name ... - * where start, stop are 0 based, closed intervals - * @param f - */ - public void readArachne(SAMSequenceDictionary sequenceDictionary,GenomeLocParser genomeLocParser,File f) { - - try { - BufferedReader reader = new BufferedReader( new FileReader(f) ); - - String line = null; - while( ( line = reader.readLine() ) != null ) { - String[] halves = line.split("#",2); - if ( halves.length < 2 ) - throw new UserException.MalformedFile(f, "Line: "+line+"\nin map file "+f+"\n does not contain contig name"); - - int p1 = 0; - for ( ; p1 < halves[1].length() && Character.isWhitespace(halves[1].charAt(p1) ); p1++ ); - // p1 is now index of first non-space - int p2 = p1; - for ( ; p2 < halves[1].length() && ! Character.isWhitespace(halves[1].charAt(p2) ); p2++ ); - // p2 is index of first whitespace after first word - - if ( p1 == p2 ) - throw new UserException.MalformedFile(f, "Line: "+line+"\n in map file "+f+"\nNo contig name found after '#'"); - - String name = halves[1].substring(p1, p2); - - String[] coord_parts = halves[0].split("\\s"); - if ( coord_parts.length % 3 != 0 ) - throw new UserException.MalformedFile(f, "Line: "+line+"\n in map file "+f+"\nNumber of coordinate fields is not a multiple of 3"); - - List segments = new ArrayList( coord_parts.length / 3 ); - - for ( int i = 0 ; i < coord_parts.length ; i += 3 ) { - // Arachne map file contains 0-based, closed intervals, hence +1 below. 
- int index = Integer.parseInt(coord_parts[i]); - String contig = sequenceDictionary.getSequence(index).getSequenceName(); - int start = Integer.parseInt(coord_parts[i+1]); - int stop = Integer.parseInt(coord_parts[i+2]); - segments.add(genomeLocParser.createGenomeLoc(contig, start+1, stop+1)); - } - - addCustomContig(name, segments); - - } - reader.close(); - } catch ( FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(f, e); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(f, e); - } - } - - /** Read genomic map from specified file in "new" format. Format: - * name chr:start-stop,chr:start-stop,...,chr:start-stop - * where start, stop are 1 based, closed intervals - * @param f - */ - public void read(GenomeLocParser genomeLocParser,File f) { - - try { - BufferedReader reader = new BufferedReader( new FileReader(f) ); - - String line = null; - while( ( line = reader.readLine() ) != null ) { - int p1 = 0; - while ( p1 < line.length() && Character.isWhitespace(line.charAt(p1))) p1++; - int p2 = p1; - while ( p2 < line.length() && ! 
Character.isWhitespace(line.charAt(p2))) p2++; - if ( p1 == p2 ) continue; // empty line - - String name = line.substring(p1, p2); - - List segments = new ArrayList( 5 ); - - p1 = p2+1; // set p1 after first whitespace after the name - while ( p1 < line.length() && Character.isWhitespace(line.charAt(p1))) p1++; // skip whitespaces - p2 = p1; - while ( p2 < line.length() && line.charAt(p2) != ',') p2++; // next comma or end-of-line - - while ( p2 != p1 ) { - GenomeLoc newSegment = genomeLocParser.parseGenomeLoc(line.substring(p1, p2)); - if ( segments.size() > 0 && - segments.get(segments.size()-1).getStop()+1 == newSegment.getStart() && - segments.get(segments.size()-1).getContigIndex() == newSegment.getContigIndex()) - System.out.println("WARNING: strictly adjacent segments found in custom contig "+name); - - segments.add(newSegment); - - p1 = p2+1; // set p1 after the comma - while ( p1 < line.length() && Character.isWhitespace(line.charAt(p1))) p1++; // skip whitespaces - p2 = p1; - while ( p2 < line.length() && line.charAt(p2) != ',') p2++; // next comma or end-of-line - } - if ( segments.size() == 0 ) throw new ReviewedStingException("Line "+line+" has no intervals specified"); - addCustomContig(name, segments); - } - reader.close(); - } catch ( FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(f, e); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(f, e); - } - } - - public void write(File f) { - try { - BufferedWriter writer = new BufferedWriter( new FileWriter( f )); - for ( String name : nameSet() ) { - writer.append(name+" "); - Iterator iter = getContigMapping(name).iterator(); - if ( iter.hasNext() ) writer.append(iter.next().toString()); - while (iter.hasNext()) { - writer.append(','); - writer.append(iter.next().toString()); - } - writer.append('\n'); - } - writer.close(); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(f, e); - } - } - - /** Remaps a record (read) 
aligned to a custom contig back onto the master reference. - * If the map does not have mapping information for - * the contig, an exception will be thrown. This method changes read's reference name, start position and - * cigar, as well as the read's file header (must be provided). - * - * Some aligners (e.g. bwa) can return "alignments" spanning across contig boundaries. The last argument of this - * method controls the behavior in this case: if it is set to true, such alignments are ignored upon detection, - * and the method returns null. Otherwise, strict validation mode is used: if aligned read extends beyond the - * contig boundary, an exception is thrown. - * - * @param r read, alignment information (contig, start position, cigar) will be modified by this method - * @param h SAM file header for the master reference the alignment is being mapped onto; will be substituted for the read's header. - * @return same read instance that was passed to this method, remapped - */ - public SAMRecord remapToMasterReference(SAMRecord r, SAMFileHeader h, boolean discardCrossContig) { - if ( AlignmentUtils.isReadUnmapped(r) ) { - // set to NO_... just in case: in principle, SAM format spec allows unmapped reads (with 'unmapped' - // flag raised) to have reference contig and start position set to arbitrary values for sorting - // purposes; after remapping, these values would make no sense or even cause a crash when reading - // remapped bam - r.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); - r.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); - - // these are required because current santools jdk is over-validating and requiring MAPQ, CIGAR etc be - // set to 0/null for unmapped reads. In principle, it should not matter. 
- r.setMappingQuality(0); - r.setCigar(new Cigar()); - r.setNotPrimaryAlignmentFlag(false); - if ( r.getReadNegativeStrandFlag() ) { - r.setReadBases(BaseUtils.simpleReverseComplement(r.getReadBases())); - r.setBaseQualities(Utils.reverse(r.getBaseQualities())); - r.setReadNegativeStrandFlag(false); - } - return r; // nothing to do if read is unmapped - } - - int customStart = r.getAlignmentStart(); - - // get mapping from read's contig onto a "global" contig (as a list of intervals on the latter): - Collection segments = getContigMapping(r.getReferenceName()); - if ( segments == null ) throw new UserException.MalformedBAM(r, "Can not remap a record: unknown custom contig name "+r.getReferenceName()); - - // scroll the list of intervals until we find the interval that the alignment start falls into: - Pair, Integer> p = seekForward(segments,customStart); - - Iterator iter = p.first; - - GenomeLoc gl = iter.next(); // initialization: get interval that contains customStart - - // p.second is 1-based position of alignment start relative to interval gl; - // hence refPos is 1-based position of the alignment start on the master ref (since gl.getStart() is 1-based too) - int refPos = (int)(p.second+gl.getStart()-1); - - String oldRefName = r.getReferenceName(); - int oldStart = r.getAlignmentStart(); - int oldEnd = r.getAlignmentEnd(); - - r.setAlignmentStart(refPos); - - r.setHeader(h); // have to substitute here, or setReferenceIndex will not work correctly below - - r.setReferenceIndex(gl.getContigIndex()); - - Cigar oldCigar = r.getCigar(); - Cigar newCigar = new Cigar(); - int N = oldCigar.numCigarElements() ; - - long currStop = gl.getStop();// end of the current segment of the custom contig on the master reference, 1-based inclusive - int delayedGap = 0 ; // length of the 'N' gap between the segments (intervals) on the master ref, to be added only if followed by another cigar element - for ( int k = 0; k < N ; k++ ) { - CigarElement ce = 
oldCigar.getCigarElement(k); - int len = ce.getLength(); // length of the cigar element - switch( ce.getOperator() ) { - case S: // soft clip - case H: // or hard clip - these are not included in getAlignmentStart, so pass them through - if ( k != 0 && k != N-1 ) // paranoid - throw new ReviewedStingException("Don't know what to do with S or N cigar element that is not at the either end of the cigar. Cigar: "+ - r.getCigarString()); - case I: // insertions are passed through as well - newCigar.add(new CigarElement(len,ce.getOperator())); - break; - case D: - case M: -/////////// - if ( delayedGap > 0 ) { - // we get here if previous M or D element ended exactly at the interval boundary; we need - // to add the stretch of N's only if that element turned out to be not the last one, so we do it now - newCigar.add(new CigarElement(delayedGap, CigarOperator.N)); - delayedGap = 0; - } - while ( refPos + len - 1 > currStop ) { // current D or M cigar element extends beyond the end of current segment - - // we have that many bases in the current cigar element till the end of the current segment: - int currLength = (int)(currStop-refPos+1); - - - // curr length can be exactly 0 if previous element ended exactly at the segment boundary: - // after that element was processed, refPos was set to currStop+1, so in this special case we need - // *first* to switch to next segment, *then* start adding bases from the current element. - if ( currLength > 0 ) { - newCigar.add(new CigarElement( currLength,ce.getOperator()) ); // record deletion/match till the end of the current segment - len -= currLength; // we still have 'len' bases remaining in the current cigar element - } - - // NOTE: since we entered the loop, we were guaranteed that len > currLength, so now len > 0 - - // check if we have next segment to extend remaining matching bases to; if we don't, something's awfully wrong: - if ( ! 
iter.hasNext() ) { - String message = "Record "+r.getReadName()+" extends beyond its custom contig."+ - "\nRead aligns to: "+oldRefName+":"+oldStart+"-"+oldEnd+"; cigar="+ - r.getCigarString()+"; contig length="+contigLength(segments); - if ( discardCrossContig ) { - // System.out.println("WARNING: ALIGNMENT DISCARDED: "+message); - return null; - } else throw new UserException.MalformedBAM(r, message); - } - - gl = iter.next(); // advance to next segment - - refPos = (int)gl.getStart(); // we jump to the start of next segment on the master ref - - if ( gl.getContigIndex() != r.getReferenceIndex() ) - throw new UserException.MalformedBAM(r, "Contig "+oldRefName+ - " has segments on different master contigs: currently unsupported"); - - if ( refPos < currStop + 1 ) - throw new UserException.MalformedBAM(r, "Contig "+oldRefName+ - " has segments that are out of order or strictly adjacent: currently unsupported"); - if ( len > 0 && refPos > currStop + 1 ) { - // add "panning" N's w/respect to the master ref over the region between adjacent segments - // (and do not add anything if segments are strictly adjacent, i.e. refPos == currStop+1): - newCigar.add(new CigarElement((int)(refPos-currStop-1),CigarOperator.N)); - } else { - // we jumped onto the next interval, but the current cigar element ended exactly - // at the end of the previous interval. 
We will need to end later, only if more M/D elements follow: - delayedGap = (int)(refPos-currStop-1); - } - currStop = gl.getStop(); - // now we can continue with recording remaining matching bases over the current segment - } - // we get here when remaining matching bases fit completely inside the current segment: - if ( len > 0 ) newCigar.add(new CigarElement(len,ce.getOperator())); - refPos+=len; - - break; -//////////// - } - } - - r.setCigar(newCigar); - - return r; - } - - public int size() { return map.size(); } - - public Iterator > > iterator() { return map.entrySet().iterator(); } - public Iterator nameIterator() { return map.keySet().iterator(); } - public Set nameSet() { return map.keySet(); } - - /** Returns an iterator into the specified collection of segments that points right before the segment that contains - * specified position, and the offset of the position inside that segment. This helper method assumes that - * there is a "custom" contig built of intervals on the "master" reference; the first argument specifies - * the mapping (i.e. an ordered collection of master reference intervals the custom contig is built of), and the second argument - * is the 1-based position on that custom contig. Returned iterator is advanced towards the interval (element of the passed - * collection) that contains the specified position, namely a call to next() on the returned iterator will return that interval. - * Returned integer offset is the 1-based offset of the base at position position on the custom contig with respect - * to the start of the interval that base. If position is outside of the custom contig, runtime StingException will be thrown. 
- * @param segments mapping of the custom contig onto the master reference - * @param position 1-based position on the custom contig - * @return - */ - private Pair,Integer> seekForward(Collection segments,int position) { - - if ( position < 1 ) throw new ReviewedStingException("Position "+position + " is outside of custom contig boundaries"); - - PushbackIterator iter = new PushbackIterator(segments.iterator()); - - while ( iter.hasNext() ) { - GenomeLoc current = iter.next(); - long length = current.getStop() - current.getStart() + 1; // length of current segment - if ( position <= length ) { // position is on the current segment - iter.pushback(current); - return new Pair, Integer >( iter,position); - } - // no, position is beyond the current segment; subtract the length of current segment and step to next one - position -= length; - } - // if we get here, position is to the right of the last segment; not good. - throw new ReviewedStingException("Position "+position + " is outside of custom contig boundaries"); - } - - private long contigLength(Collection segments) { - long l = 0; - for ( GenomeLoc g : segments ) l += (g.getStop() - g.getStart() + 1 ); - return l; - } - - public static void main(String argv[]) { - -// SAMFileReader reader = new SAMFileReader(new java.io.File("/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0904/normal.bam")); - SAMFileReader reader = new SAMFileReader(new java.io.File("X:/asivache/cDNA/new_pipeline/30BV1/test.1.sam")); - - - SAMRecord r = new SAMRecord(reader.getFileHeader()); - GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); - - r.setReferenceName("ENST00000378466"); - r.setAlignmentStart(1235); - r.setCigarString("24M1D27M"); - -// List s = new ArrayList(); -// s.add( GenomeLocParser.createGenomeLoc("chr1", 100, 199)); -// s.add( GenomeLocParser.createGenomeLoc("chr1", 300, 499)); -// s.add( GenomeLocParser.createGenomeLoc("chr1", 600, 799)); - - GenomicMap m = new 
GenomicMap(5); - -// m.readArachne(genomeLocParser,new File("/humgen/gsa-scr1/asivache/cDNA/Ensembl48.transcriptome.map")); -// m.write(new File("/humgen/gsa-scr1/asivache/cDNA/new_pipeline/Ensembl48.new.transcriptome.map")); - m.read(genomeLocParser,new File("W:/berger/cDNA_BAM/refs/Ensembl52.plus.Genome.map")); - - m.remapToMasterReference(r,reader.getFileHeader(),true); - -// if ( m.getContigMapping("ENST00000302418") == null ) System.out.println("ERROR! CONTIG IS MISSING!"); - - int cnt = 0; - - System.out.println(m.size() + " contigs loaded"); - System.out.println("new alignment: "+r.format()) ; -/* - for ( String name : m.nameSet() ) { - - System.out.print(name); - System.out.print(": "); - for ( GenomeLoc g : m.getContigMapping(name)) { - System.out.print(g.toString()+", "); - } - System.out.println(); - cnt ++; - if ( cnt > 10 ) break; - } -*/ -// m.addCustomContig("My", s); -/* - r.setReferenceName("My"); - r.setAlignmentStart(3); - r.setCigarString("5S97M5D197M5H"); - - m.remapToMasterReference(r); - System.out.println(r.getReferenceName()+":"+r.getAlignmentStart()+" "+r.getCigarString()); -*/ - reader.close(); - - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/utils/NamedTable.java b/java/src/org/broadinstitute/sting/playground/utils/NamedTable.java deleted file mode 100755 index f84466042..000000000 --- a/java/src/org/broadinstitute/sting/playground/utils/NamedTable.java +++ /dev/null @@ -1,292 +0,0 @@ -package org.broadinstitute.sting.playground.utils; - -import java.util.*; - -/** - * NamedTable is a utility class for maintaining a table and accessing rows and columns - * with named identifiers, rather than indicies that must be remembered. It also grows - * dynamically; you needn't specify the rows and columns before accessing them, so you can - * continuously expand the table in situations where you don't necessarily know how many - * rows or columns you'll have in the end. 
- */ -public class NamedTable { - // If in the future, this class gets templatized, the counter variable should really become a CountedObject. - private HashMap> table; - private HashSet rowNames; - private HashSet colNames; - - public NamedTable() { - table = new HashMap>(); - rowNames = new HashSet(); - colNames = new HashSet(); - } - - /** - * Copy another table into this new table - * @param ct the table to copy - */ - public NamedTable(NamedTable ct) { - table = new HashMap>(); - rowNames = new HashSet(); - colNames = new HashSet(); - - for (String rowName : ct.getRowNames()) { - for (String colName : ct.getColumnNames()) { - this.set(rowName, colName, ct.get(rowName, colName)); - } - } - } - - /** - * If the entry we're trying to access doesn't exist, create it. - * - * @param rowName the name of the row - * @param colName the name of the column - */ - private void verifyEntry(String rowName, String colName) { - rowNames.add(rowName); - colNames.add(colName); - - if (!table.containsKey(rowName)) { - table.put(rowName, new HashMap()); - } - - if (!table.get(rowName).containsKey(colName)) { - table.get(rowName).put(colName, 0.0); - } - } - - /** - * Set an entry in the table - * - * @param rowName the name of the row - * @param colName the name of the column - * @param value the value to set for the (row,column)-th entry - */ - public void set(String rowName, String colName, double value) { - verifyEntry(rowName, colName); - - table.get(rowName).put(colName, value); - } - - /** - * Get an entry in the table - * - * @param rowName the name of the row - * @param colName the name of the column - * @return the value of the (row,column)-th entry - */ - public double get(String rowName, String colName) { - verifyEntry(rowName, colName); - - return table.get(rowName).get(colName); - } - - /** - * For convenience, increment the (row,column)-th entry in the table so that the - * user doesn't need to extract, increment, and then reassign the value. 
One day, - * this should probably be rewritten to use Andrey's CountedObject class. - * - * @param rowName the name of the row - * @param colName the name of the column - */ - public void increment(String rowName, String colName) { - double value = get(rowName, colName); - - table.get(rowName).put(colName, value + 1.0); - } - - /** - * For convenience, decrement the (row,column)-th entry in the table so that the - * user doesn't need to extract, increment, and then reassign the value. One day, - * this should probably be rewritten to use Andrey's CountedObject class. - * - * @param rowName the name of the row - * @param colName the name of the column - */ - public void decrement(String rowName, String colName) { - double value = get(rowName, colName); - - table.get(rowName).put(colName, value - 1.0); - } - - /** - * Get a sorted list of all the rows in the table - * - * @return a sorted list of all the rows in the table - */ - public ArrayList getRowNames() { - ArrayList rows = new ArrayList(); - Iterator rowit = rowNames.iterator(); - while (rowit.hasNext()) { - rows.add(rowit.next()); - } - Collections.sort(rows); - - return rows; - } - - /** - * Get a sorted list of all the columns in the table - * - * @return a sorted list of all the columns in the table - */ - public ArrayList getColumnNames() { - ArrayList cols = new ArrayList(); - Iterator colit = colNames.iterator(); - while (colit.hasNext()) { - cols.add(colit.next()); - } - Collections.sort(cols); - - return cols; - } - - /** - * Get a new table representing a subset of the current table - * @param rowNames a list of rows to extract - * @param colNames a list of columns to extract - * @return the subsetted table - */ - public NamedTable getSubset(ArrayList rowNames, ArrayList colNames) { - NamedTable ct = new NamedTable(); - - for (String rowName : rowNames) { - for (String colName : colNames) { - ct.set(rowName, colName, get(rowName, colName)); - } - } - - return ct; - } - - /* This stuff doesn't belong 
in this class, but I don't want - to delete the code until it's moved somewhere appropriate */ - /* - public boolean twoTailedFisherExactTest(double pValueLimit) { - return (twoTailedFisherExactTest() < pValueLimit); - } - - public double oneTailedFisherExactTestRight() { - NamedTable ct = new NamedTable(this); - - double pCutoff = pValue(); - double pValue = pCutoff; - - while (ct.rotateRight()) { - double pValuePiece = ct.pValue(); - - if (pValuePiece <= pCutoff) { - pValue += pValuePiece; - } - } - - return pValue; - } - - public double oneTailedFisherExactTestLeft() { - NamedTable ct = new NamedTable(this); - - double pCutoff = pValue(); - double pValue = pCutoff; - - while (ct.rotateLeft()) { - double pValuePiece = ct.pValue(); - - if (pValuePiece <= pCutoff) { - pValue += pValuePiece; - } - } - - return pValue; - } - - public double twoTailedFisherExactTest() { - return oneTailedFisherExactTestLeft() + oneTailedFisherExactTestRight(); - } - - public double pValue() { - double p = 0.0; - - if (rowNames.size() == 2 && colNames.size() == 2) { - String[] rows = rowNames.toArray(new String[1]); - String[] columns = colNames.toArray(new String[1]); - - double a = get(rows[0], columns[0]); - double b = get(rows[0], columns[1]); - double c = get(rows[1], columns[0]); - double d = get(rows[1], columns[1]); - double n = a + b + c + d; - - double p1 = Arithmetic.binomial(a + b, (long) a); - double p2 = Arithmetic.binomial(c + d, (long) c); - double pn = Arithmetic.binomial(n, (long) (a + c)); - - p = p1*p2/pn; - } - - return p; - } - - public boolean rotateRight() { - String[] rows = rowNames.toArray(new String[1]); - String[] columns = colNames.toArray(new String[1]); - - decrement(rows[0], columns[0]); - increment(rows[1], columns[0]); - - increment(rows[0], columns[1]); - decrement(rows[1], columns[1]); - - return (get(rows[0], columns[0]) >= 0 && get(rows[1], columns[1]) >= 0); - } - - public boolean rotateLeft() { - String[] rows = rowNames.toArray(new String[1]); 
- String[] columns = colNames.toArray(new String[1]); - - increment(rows[0], columns[0]); - decrement(rows[1], columns[0]); - - decrement(rows[0], columns[1]); - increment(rows[1], columns[1]); - - return (get(rows[0], columns[1]) >= 0 && get(rows[1], columns[0]) >= 0); - } - */ - - /** - * Get a nicely-formatted version of the contents of the table. - * - * @return a String representing the contents of the table - */ - public String toString() { - String tableString = ""; - boolean headerPrinted = false; - - ArrayList rows = getRowNames(); - ArrayList cols = getColumnNames(); - - for (String rowName : rows) { - if (!headerPrinted) { - tableString += "rowName "; - for (String colName : cols) { - tableString += "\t" + colName; - } - tableString += "\n"; - - headerPrinted = true; - } - - tableString += rowName; - - for (String colName : cols) { - tableString += String.format("\t%7.7f", get(rowName, colName)); - } - - tableString += "\n"; - } - - return tableString; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/utils/ParallelSAMIterator.java b/java/src/org/broadinstitute/sting/playground/utils/ParallelSAMIterator.java deleted file mode 100644 index 2433b3f65..000000000 --- a/java/src/org/broadinstitute/sting/playground/utils/ParallelSAMIterator.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.utils; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.util.CloseableIterator; - -import java.util.List; -import java.util.ArrayList; - -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; - -/** - * Iterates synchronously over two SAM files. At each iteration returs alignments with the same read name (in the order - * the read names appear in the files). Alignment(s) from the first/second SAM file will be stored as the first/second - * element of the pair, respectively. Multiple alignments (alternative placements) for a given read are allowed in both - * input files. If only one of the files have alignment(s) for a given read name, the returned - * pair will contain an empty list in the element corresponding to the other file. To enable this sort of traversal - * synchronized by read names, the input SAM files must be sorted by read name. Constructor of this class verifies - * that this is the case: SAM file headers must report either 'queryname' sorting order, or no sorting order - * (to allow the code to work with not-fully compliant 3rd party tools that do not set header flags properly; a warning - * will be currently printed to stdout in this case); if sorting order in either of the files is set to "coordinate", - * an exception will be thrown. 
- */ - public class ParallelSAMIterator implements CloseableIterator< Pair< List, List > > { - private SAMFileReader reader1; - private SAMFileReader reader2; - PushbackIterator i1; - PushbackIterator i2; - List alignments1; - List alignments2; - - public ParallelSAMIterator(SAMFileReader r1, SAMFileReader r2) { - reader1 = r1; - reader2 = r2; - checkSortOrder(r1,"End 1"); - checkSortOrder(r2, "End 2"); - i1 = new PushbackIterator(r1.iterator()); - i2 = new PushbackIterator(r2.iterator()); - alignments1 = nextGroup(i1); // pre-read next set of alignments - alignments2 = nextGroup(i2); - } - - - /** - * Returns true if the iteration has more elements. (In other - * words, returns true if next would return an element - * rather than throwing an exception.) - * - * @return true if the iterator has more elements. - */ - public boolean hasNext() { - return alignments1.size() > 0 || alignments2.size() > 0; - } - - /** - * Returns the next element in the iteration. - * - * @return the next element in the iteration. - * @throws java.util.NoSuchElementException - * iteration has no more elements. - */ - public Pair< List, List > next() { - Pair< List, List > result; - - if ( alignments1.size() == 0 ) { - // no more alignments left for end1 - result = new Pair< List, List >(alignments1,alignments2); - alignments2 = nextGroup(i2); - return result; - } - if ( alignments2.size() == 0 ) { - // no more alignments left for end2 - result = new Pair< List, List >(alignments1,alignments2); - alignments1 = nextGroup(i1); - return result; - } - // next group of alignments is held for both ends. 
Check the read names: - String end1Name = alignments1.get(0).getReadName(); - String end2Name = alignments2.get(0).getReadName(); - - int cmp = end1Name.compareTo(end2Name); - if ( cmp < 0 ) { - // end1 goes before end2; return end1 with empty list for corresponding end2 and read next end1 - result = new Pair< List, List >(alignments1,new ArrayList()); - alignments1 = nextGroup(i1); - } else { - if ( cmp > 0 ) { - // end2 goes before end1; return end2 with empty list for corresponding end1 and read next end2 - result = new Pair< List, List >(new ArrayList(),alignments2); - alignments2 = nextGroup(i2); - } else { - // end 1 and end2 have the same read name => we got a mate pair: - result = new Pair< List, List >(alignments1, alignments2); - alignments1 = nextGroup(i1); - alignments2 = nextGroup(i2); - } - } - return result; - } - - /** - * Removes from the underlying collection the last element returned by the - * iterator (optional operation). This method can be called only once per - * call to next. The behavior of an iterator is unspecified if - * the underlying collection is modified while the iteration is in - * progress in any way other than by calling this method. - * - * @throws UnsupportedOperationException if the remove - * operation is not supported by this Iterator. - * @throws IllegalStateException if the next method has not - * yet been called, or the remove method has already - * been called after the last call to the next - * method. - */ - public void remove() { - throw new UnsupportedOperationException("ParallelSAMIterator does not support remove() operation."); - } - - public void close() { - reader1.close(); - reader2.close(); - } - - /** - * Read next alignment, and all immediately following ones that share same read name with the first; - * return them all as a list. - * @param i - * @return - */ - private List nextGroup(PushbackIterator i) { - List result = new ArrayList(); - String readName ; - - if ( ! 
i.hasNext() ) return result; // nothing left - SAMRecord r = i.next(); - readName = r.getReadName(); - result.add(r); - - while ( i.hasNext() ) { - r = i.next(); - if ( ! r.getReadName().equals(readName) ) { - i.pushback(r); - break; - } - result.add(r); - } - return result; - } - - /** - * Utility method: checks that the sorting order in the input file is right - * - * @param reader sam file reader - * @param fileName name of the file the reader is associated with. Used only to create more intelligible warning/exception messages, - * you can actually pass any string here. - */ - private void checkSortOrder(SAMFileReader reader, String fileName) { - - if ( reader.getFileHeader() == null ) { - System.out.println("WARNING: File "+fileName+" has no header. Assuming that file is sorted by read name."); - } - - switch ( reader.getFileHeader().getSortOrder() ) { - case coordinate: - throw new RuntimeException("File "+fileName+" is sorted by coordinate. Sort it by read name first."); - case unsorted: - System.out.println("WARNING: file "+fileName+" has sorting order tag set to 'unsorted'. 
"+ - "Assuming that it is sorted by read name."); - break; - case queryname: break; // good, that's what we need - default: throw new RuntimeException("File "+fileName + ": unknown sorting order ("+ - reader.getFileHeader().getSortOrder()+")"); - } - - } - - } diff --git a/java/src/org/broadinstitute/sting/playground/utils/ProcessUtils.java b/java/src/org/broadinstitute/sting/playground/utils/ProcessUtils.java deleted file mode 100644 index 781a6fc9c..000000000 --- a/java/src/org/broadinstitute/sting/playground/utils/ProcessUtils.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.utils; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.apache.log4j.Logger; - -/** - * A set of utilities for managing external processes. - */ -public class ProcessUtils { - private static Logger logger = Logger.getLogger(ProcessUtils.class); - - /** - * Runs a command line and returns the result code. - * @param command Command line to execute. - * @return Result code of the command. - */ - public static int runCommandAndWait(String command) { - try { - logger.debug("Running command: " + command); - - Process p = Runtime.getRuntime().exec(command); - int result = p.waitFor(); - - if (logger.isDebugEnabled()) { - for (String line : new XReadLines(p.getInputStream())) { - logger.debug("command: " + line); - } - for (String line : new XReadLines(p.getErrorStream())) { - logger.error("command: " + line); - } - } - - logger.debug("Command exited with result: " + result); - - return result; - } catch (Exception e) { - throw new ReviewedStingException("Error running command:" + command, e); - } - } -} diff --git a/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java b/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java deleted file mode 100644 index dafaf3ffe..000000000 --- a/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java +++ /dev/null @@ -1,27 +0,0 @@ -package org.broadinstitute.sting.alignment; - -import org.testng.annotations.Test; -import org.broadinstitute.sting.WalkerTest; - -import java.util.Arrays; - -/** - * Integration tests for the aligner. 
- * - * @author mhanna - * @version 0.1 - */ -public class AlignerIntegrationTest extends WalkerTest { - @Test - public void testBasicAlignment() { - String md5 = "34eb4323742999d6d250a0aaa803c6d5"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + GATKDataLocation + "bwa/human_b36_both.fasta" + - " -T Align" + - " -I " + validationDataLocation + "NA12878_Pilot1_20.trimmed.unmapped.bam" + - " -o %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testBasicAlignment", spec); - } -} diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReadsIntegrationTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReadsIntegrationTest.java deleted file mode 100644 index fa9d0afbb..000000000 --- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReadsIntegrationTest.java +++ /dev/null @@ -1,36 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -/** - * check that we're getting the expected results from the RODs for reads for a variety of input types - */ -public class ValidateRODForReadsIntegrationTest extends WalkerTest { - - private final String vcfFile = validationDataLocation + "rodForReadsVCFCheck.vcf"; - private final String dbSNPFile = GATKDataLocation + "dbsnp_129_hg18.rod"; - - public static String baseTestString() { - return "-T ValidateRODForReads -o %s -R " + hg18Reference + " -I " + validationDataLocation + "small_bam_for_rods_for_reads.bam"; - } - - - @Test - public void testSimpleVCFPileup() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B:vcf,vcf3 " + vcfFile, 1, - Arrays.asList("f7919e9dc156fb5d3ad0541666864ea5")); - executeTest("testSimpleVCFPileup", spec); - } - - @Test - public void testSimpleDbSNPPileup() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " 
-B:dbsnp,dbsnp " + dbSNPFile, 1, - Arrays.asList("c63b8ef9291a450f0519c73ac9cae189")); - executeTest("testSimpleDbSNPPileup", spec); - } -} diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerIntegrationTest.java deleted file mode 100755 index 5b54d110c..000000000 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerIntegrationTest.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class HLACallerIntegrationTest extends WalkerTest { - - private static final String intervals = validationDataLocation + "HLA_EXONS.intervals"; - - - @Test - public void testFindClosestHLA() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T FindClosestHLA -I " + validationDataLocation + "NA12878.HISEQ.HLA.bam -R " + b36KGReference + " -L " + intervals + " -useInterval " + intervals + " -HLAdictionary " + validationDataLocation + "HLA_DICTIONARY.txt -PolymorphicSites " + validationDataLocation + "HLA_POLYMORPHIC_SITES.txt -o %s", 1, - Arrays.asList("a49b6f54a4585d1dd958c55a5523427d")); - executeTest("test FindClosestHLA", spec); - } - - @Test - public void testCalculateBaseLikelihoods() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T CalculateBaseLikelihoods -I " + validationDataLocation + "NA12878.HISEQ.HLA.bam -R " + b36KGReference + " -L " + intervals + " -filter " + validationDataLocation + "HLA_HISEQ.filter -maxAllowedMismatches 6 -minRequiredMatches 0 -o %s", 1, - Arrays.asList("921bb354f3877e5183ca31815546b9fd")); - executeTest("test CalculateBaseLikelihoods", spec); - } - - @Test - public void testHLACaller() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T HLACaller -noVerbose -I " + validationDataLocation + "NA12878.HISEQ.HLA.bam -R " + b36KGReference + " -L " + intervals + " -useInterval " + intervals + " -HLAdictionary " + validationDataLocation + "HLA_DICTIONARY.txt -filter " + validationDataLocation + "HLA_HISEQ.filter -maxAllowedMismatches 6 -minRequiredMatches 5 -HLAfrequencies " + validationDataLocation + "HLA_FREQUENCIES.txt -bl " + validationDataLocation + "HLA_HISEQ.baselikelihoods -o %s", 1, - Arrays.asList("f9931b378bde213e71fca6ecaa24b48b")); - executeTest("test HLACaller", spec); - } -} diff --git 
a/java/test/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicatesWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicatesWalkersIntegrationTest.java deleted file mode 100755 index 12bda2af8..000000000 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicatesWalkersIntegrationTest.java +++ /dev/null @@ -1,40 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.duplicates; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.List; - -public class DuplicatesWalkersIntegrationTest extends WalkerTest { - public void testCounter(String name, String args, String md5) { - WalkerTestSpec spec = new WalkerTestSpec( - "-T CountDuplicates" + - " -R " + hg18Reference + - " -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/TCGA-06-0188.aligned.duplicates_marked.bam" + - " -o %s " + args, - 1, // just one output file - Arrays.asList("tmp"), - Arrays.asList(md5)); - List result = executeTest(name, spec).getFirst(); - } - - @Test public void testChr110Mb() { testCounter("testChr1-10mb", "-L chr1:1-10,000,000 --quietLocus", "d3c329a634904d95c4b180d0d63eadfc"); } - @Test public void testIntervalVerbose() { testCounter("testIntervalVerbose", "-L chr1:6,527,154-6,528,292", "5fbb930020df6ca7d0f724524fc43b3e"); } - - public void testCombiner(String name, String args, String md51, String md52) { - WalkerTestSpec spec = new WalkerTestSpec( - "-T CombineDuplicates" + - " -R " + hg18Reference + - " -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/TCGA-06-0188.aligned.duplicates_marked.bam" + - " -o %s --outputBAM %s " + args, - 2, // just one output file - Arrays.asList("tmp", "bam"), - Arrays.asList(md51, md52)); - List result = executeTest(name, spec).getFirst(); - } - - @Test public void testIntervalCombine() { testCombiner("testIntervalCombine", "-L chr1:6,527,154-6,528,292 
-maxQ 50", "d41d8cd98f00b204e9800998ecf8427e", "4541f57820637039bc2f5a97bcaadfe4"); } - @Test public void testIntervalCombineQ60() { testCombiner("testIntervalCombine", "-L chr1:6,527,154-6,528,292 -maxQ 60", "d41d8cd98f00b204e9800998ecf8427e", "8c0350c0a697e4083aab6ead3f404de4"); } -} diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCountsUnitTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCountsUnitTest.java deleted file mode 100644 index 6f5e1ad49..000000000 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCountsUnitTest.java +++ /dev/null @@ -1,73 +0,0 @@ -// our package -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - - -// the imports for unit testing. - - -import net.sf.samtools.SAMRecord; -import org.apache.commons.lang.StringUtils; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.List; - -/** - * Basic unit test for BaseCounts in reduced reads - */ -public class BaseCountsUnitTest extends BaseTest { - private class SingleTest { - public String bases; - public byte mostCountBase; - public int mostCommonCount; - - private SingleTest(String bases, char mostCountBase, int mostCommonCount) { - this.mostCommonCount = mostCommonCount; - this.mostCountBase = (byte)mostCountBase; - this.bases = bases; - } - } - - - @DataProvider(name = "data") - public Object[][] createData1() { - List params = new ArrayList(); - - params.add(new SingleTest("A", 'A', 1 )); - params.add(new SingleTest("AA", 'A', 2 )); - params.add(new SingleTest("AC", 'A', 1 )); - params.add(new SingleTest("AAC", 'A', 2 )); - params.add(new SingleTest("AAA", 'A', 3 )); - params.add(new SingleTest("AAAN", 'A', 3 
)); - params.add(new SingleTest("AAANNNN", 'A', 3 )); - params.add(new SingleTest("AACTG", 'A', 2 )); - params.add(new SingleTest("D", 'D', 1 )); - params.add(new SingleTest("DDAAD", 'D', 3)); - params.add(new SingleTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 )); - params.add(new SingleTest("AAIIIAI", 'I', 4 )); - - List params2 = new ArrayList(); - for ( SingleTest x : params ) params2.add(new Object[]{x}); - return params2.toArray(new Object[][]{}); - } - - - - @Test(dataProvider = "data", enabled = true) - public void testCounting(SingleTest params) { - BaseCounts counts = new BaseCounts(); - - for ( byte base : params.bases.getBytes() ) - counts.incr(base); - - String name = String.format("Test-%s", params.bases); - Assert.assertEquals(counts.totalCount(), params.bases.length() - StringUtils.countMatches(params.bases, "N"), name); - Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name); - Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name); - } -} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSpanUnitTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSpanUnitTest.java deleted file mode 100644 index acbb481e0..000000000 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSpanUnitTest.java +++ /dev/null @@ -1,150 +0,0 @@ -// our package -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - - -// the imports for unit testing. 
- - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.List; - -/** - * Basic unit test for GenomeLoc - */ -public class ConsensusSpanUnitTest extends BaseTest { - File referenceFile = new File(hg19Reference); - GenomeLocParser genomeLocParser; - IndexedFastaSequenceFile fasta; - GenomeLoc loc; - - @BeforeClass - public void before() { - try { - fasta = new IndexedFastaSequenceFile(referenceFile); - genomeLocParser = new GenomeLocParser(fasta.getSequenceDictionary()); - loc = genomeLocParser.createGenomeLoc("1", 10, 19); - - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(referenceFile,ex); - } - } - -// private class BAQTest { -// String readBases, refBases; -// byte[] quals, expected; -// String cigar; -// int refOffset; -// int pos; -// -// public BAQTest(String _refBases, String _readBases, String _quals, String _expected) { -// this(0, -1, null, _readBases, _refBases, _quals, _expected); -// } -// -// public BAQTest(int refOffset, String _refBases, String _readBases, String _quals, String _expected) { -// this(refOffset, -1, null, _refBases, _readBases, _quals, _expected); -// } -// -// public BAQTest(long pos, 
String cigar, String _readBases, String _quals, String _expected) { -// this(0, pos, cigar, null, _readBases, _quals, _expected); -// } -// -// -// public BAQTest(int _refOffset, long _pos, String _cigar, String _refBases, String _readBases, String _quals, String _expected) { -// refOffset = _refOffset; -// pos = (int)_pos; -// cigar = _cigar; -// readBases = _readBases; -// refBases = _refBases; -// -// quals = new byte[_quals.getBytes().length]; -// expected = new byte[_quals.getBytes().length]; -// for ( int i = 0; i < quals.length; i++) { -// quals[i] = (byte)(_quals.getBytes()[i] - 33); -// expected[i] = (byte)(_expected.getBytes()[i] - 33); -// } -// } -// -// public String toString() { return readBases; } -// -// public SAMRecord createRead() { -// SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, pos > 0 ? pos + (refOffset > 0 ? refOffset : 0): 1, readBases.getBytes(), quals); -// //if ( cigar != null ) read.setAlignmentEnd(readBases.getBytes().length + pos); -// read.setCigarString( cigar == null ? 
String.format("%dM", quals.length) : cigar); -// return read; -// } -// } - - @Test(enabled = true) - public void testType() { - Assert.assertEquals(ConsensusSpan.Type.CONSERVED, ConsensusSpan.Type.otherType(ConsensusSpan.Type.VARIABLE)); - Assert.assertEquals(ConsensusSpan.Type.VARIABLE, ConsensusSpan.Type.otherType(ConsensusSpan.Type.CONSERVED)); - } - - @Test(enabled = true) - public void testConsensusSpanOffset0() { - ConsensusSpan span = new ConsensusSpan(0, loc, ConsensusSpan.Type.CONSERVED); - Assert.assertEquals(span.getOffsetFromStartOfSites(), 10); - Assert.assertEquals(span.getGenomeStart(), loc.getStart()); - Assert.assertEquals(span.getGenomeStop(), loc.getStop()); - Assert.assertEquals(span.getConsensusType(), ConsensusSpan.Type.CONSERVED); - Assert.assertEquals(span.size(), 10); - } - - @Test(enabled = true) - public void testConsensusSpanOffset10() { - ConsensusSpan span = new ConsensusSpan(10, loc, ConsensusSpan.Type.CONSERVED); - Assert.assertEquals(span.getOffsetFromStartOfSites(), 0); - Assert.assertEquals(span.getGenomeStart(), loc.getStart()); - Assert.assertEquals(span.getGenomeStop(), loc.getStop()); - Assert.assertEquals(span.getConsensusType(), ConsensusSpan.Type.CONSERVED); - Assert.assertEquals(span.size(), 10); - } - - @Test(enabled = true) - public void testConsensusSpanTypes() { - ConsensusSpan conserved = new ConsensusSpan(0, loc, ConsensusSpan.Type.CONSERVED); - Assert.assertEquals(conserved.getConsensusType(), ConsensusSpan.Type.CONSERVED); - Assert.assertTrue(conserved.isConserved()); - Assert.assertFalse(conserved.isVariable()); - - ConsensusSpan variable = new ConsensusSpan(0, loc, ConsensusSpan.Type.VARIABLE); - Assert.assertEquals(variable.getConsensusType(), ConsensusSpan.Type.VARIABLE); - Assert.assertFalse(variable.isConserved()); - Assert.assertTrue(variable.isVariable()); - } - - @Test(enabled = true, expectedExceptions = {Error.class, Exception.class}) - public void testBadSpanCreationBadOffset() { - ConsensusSpan span = 
new ConsensusSpan(-1, loc, ConsensusSpan.Type.CONSERVED); - } - - @Test(enabled = true, expectedExceptions = {Error.class, Exception.class}) - public void testBadSpanCreationNullLoc() { - ConsensusSpan span = new ConsensusSpan(0, null, ConsensusSpan.Type.CONSERVED); - } - - @Test(enabled = true, expectedExceptions = {Error.class, Exception.class}) - public void testBadSpanCreationNullType() { - ConsensusSpan span = new ConsensusSpan(0, loc, null); - } -} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ReduceReadsIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ReduceReadsIntegrationTest.java deleted file mode 100755 index a6d63e0d4..000000000 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ReduceReadsIntegrationTest.java +++ /dev/null @@ -1,36 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class ReduceReadsIntegrationTest extends WalkerTest { - final static String REF = b37KGReference; - final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; - final String L = "20:10,100,000-10,200,000"; - - private void RRTest(String args, String md5) { - String base = String.format("-T ReduceReads -R %s -I %s -L %s", REF, BAM, L) + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5)); - executeTest("testReduceReads1: args=" + args, spec); - } - - @Test() - public void testReduceReadsBase() { - RRTest("", "9aac475601d80eeb02400cbc73272b26"); - } - - @Test() - public void testReduceReads50MaxReads() { - RRTest(" -mravs 50", "eb2d8c2f1e66d7d0bf767ac55420027e"); - } - - @Test() - public void testReduceReadsMinBasesForConsensus10000() { - RRTest(" -mbrc 10000", 
"b42706d9a2621b9b63502704af00e0da"); - } - -} - diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationIntegrationTest.java deleted file mode 100644 index 6d03eaf51..000000000 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationIntegrationTest.java +++ /dev/null @@ -1,92 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.validation; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * tests for the ROD system in general; from rod system validation to empty VCF files - */ -public class RodSystemValidationIntegrationTest extends WalkerTest { - - public static String baseTestString1KG() { - return "-T RodSystemValidation -o %s -R " + b36KGReference; - } - - - @Test - public void testSimpleGeliPileup() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString1KG() + " -B:eval,GeliText " + validationDataLocation + "ROD_validation/chr1.geli", 1, - Arrays.asList("832efb29a6d4e8dbae374d3eeee17d9d")); - executeTest("testSimpleGeliPileup", spec); - } - - @Test - public void testSimpleVCFPileup() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString1KG() + " -B:eval,VCF3 " + validationDataLocation + "MultiSample.vcf", 1, - Arrays.asList("ad5c01ab5c65877913e885fdb854275c")); - executeTest("testSimpleVCFPileup", spec); - } - - @Test - public void testEmptyVCF() { - File vcf = new File(validationDataLocation + "justHeader.vcf.idx"); - if (vcf.exists()) vcf.delete(); - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString1KG() + " -B:eval,VCF3 " + validationDataLocation + "justHeader.vcf", 1, - Arrays.asList("579456b4da3498e80c42483abbdf5926")); - executeTest("testEmptyVCF", spec); - } - - - @Test - 
public void testComplexVCFPileup() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString1KG() + " -B:eval,VCF3 " + validationDataLocation + "MultiSample.vcf" + - " -B:eval2,VCF " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4" - , 1, - Arrays.asList("3cabed3262b4474a6316117a13b57edf")); - executeTest("testComplexVCFPileup", spec); - } - - @Test - public void testBTIWithROD() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString1KG() + " -B:eval,VCF3 " + validationDataLocation + "MultiSample.vcf" + - " -B:eval2,VCF " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4" + " -BTI eval" - , 1, - Arrays.asList("12876c0980f6cfeae71386e145ac5c82")); - executeTest("testBTIWithROD", spec); - } - - @Test - public void testLargeComplexVCFPileup() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString1KG() + " -B:eval,VCF3 " + validationDataLocation + "MultiSample.vcf" + - " -B:eval2,VCF3 " + validationDataLocation + "CEU_hapmap_nogt_23.vcf" + - " -B:eval3,VCF3 " + validationDataLocation + "CEU_hapmap_nogt_23.vcf" + - " -L 1 -L 2 -L 20" - , 1, - Arrays.asList("78c4d651d6c0a04b64ccee1dd9d036b9")); - executeTest("testLargeComplexVCFPileup", spec); - } - - //@Test - public void testBlockZippedVrsUnzippedVCF1() { - final String vcfName = validationDataLocation + "bgzipped_vcfs/vcfexample.vcf"; - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString1KG() + " -B:eval,VCF " + vcfName + - " -B:eval2,VCF3 " + vcfName + ".gz" + - " --PerLocusEqual" - , 1, - Arrays.asList("ab3da32eae65e8c15a9f4a787a190a37")); - executeTest("testLargeComplexVCFPileup", spec); - } -} diff --git a/lua/IntervalUtils.lua b/lua/IntervalUtils.lua deleted file mode 100644 index dc377d7f7..000000000 --- a/lua/IntervalUtils.lua +++ /dev/null @@ -1,140 +0,0 @@ ------------------------------------------------------------------------------------------------------------------------- --- Creates a new interval table --- --- Return values: --- 1: Interval 
table ------------------------------------------------------------------------------------------------------------------------- - -function newInterval(chr, start, finish, strand, info) - return {chr= chr, start=tonumber(start), finish=tonumber(finish), strand=strand, info=info} -end - - ------------------------------------------------------------------------------------------------------------------------- --- Parses a line from an interval list file (not a header line!) --- --- Return values: --- 1: chromosome --- 2: interval start --- 3: interval end --- 4: strand (+/-) --- 5: info field ------------------------------------------------------------------------------------------------------------------------- -local function parseIntervalLine(l) - return l:match("(%w+)%s+(%d+)%s+(%d+)%s+([%+%-])%s+(.*)") -end - ------------------------------------------------------------------------------------------------------------------------- --- Reads an interval list file into a table --- --- Return values: --- 1: table of intervals --- 2: header string ------------------------------------------------------------------------------------------------------------------------- -local function readIntervalList(filename) - t = {} - header = "" - for l in io.lines(filename) do - if l:sub(1,1) == "@" then header = header .. l .."\n" - else - local chr, start, finish, strand, info = parseIntervalLine(l) - table.insert(t, newInterval(chr, start, finish, strand, info)) - end - end - return t, header -end - ------------------------------------------------------------------------------------------------------------------------- --- Checks if two intervals have the same chromosome, start and end. 
--- --- Return values: --- 1: true/false ------------------------------------------------------------------------------------------------------------------------- -local function isSameInterval (i1, i2) - return i1.chr == i2.chr and i1.start == i2.start and i1.finish == i2.finish -end - ------------------------------------------------------------------------------------------------------------------------- --- Checks if the line from an interval list file is a header line --- --- Return values: --- 1: true/false ------------------------------------------------------------------------------------------------------------------------- -local function isIntervalHeaderLine(l) - return l:sub(1,1) == "@" -end - - ------------------------------------------------------------------------------------------------------------------------- --- Compares the start position of two intervals --- --- Return values: --- 1: -1, 0 or 1 (respectively for a < b, a == b, a > b) ------------------------------------------------------------------------------------------------------------------------- -local function compIntervals(a, b) - - local function c(x,y) - if x < y then return -1 - elseif x > y then return 1 - else return 0 end - end - -- same chromosomes - if a.chr == b.chr then return c(a.start, b.start) - else - x = tonumber(a.chr) - y = tonumber(b.chr) - if x and y then return c(x,y) - else return c(a.chr, b.chr) end - end -end - ------------------------------------------------------------------------------------------------------------------------- --- Compare function to sort a list of intervals (use with table.sort) --- --- Return values: --- 1: true if a < b, false otherwise. 
------------------------------------------------------------------------------------------------------------------------- -local function sortCompInterval(a, b) - if a.chr == b.chr then return a.start < b.start end - local x = tonumber(a.chr) - local y = tonumber(b.chr) - if x and y then - return x < y end - return a.chr < b.chr -end - ------------------------------------------------------------------------------------------------------------------------- --- Checks if the interval is a valid human genome interval --- --- Return values: --- 1: true/false ------------------------------------------------------------------------------------------------------------------------- -local function isValidInterval(interval) - local x - if interval.chr == "X" then x = 23 - elseif interval.chr == "Y" then x = 24 - elseif interval.chr == "MT" then x = 25 - else x = tonumber(interval.chr) end - return x >= 1 and x <= 25 and interval.start < interval.finish and chr_limits[x] > interval.finish -end - - ------------------------------------------------------------------------------------------------------------------------- --- Checks if the intervals are overlapping. Intervals are said to overlap if one of the following is true: --- i1i2: i1 starts before i2, but ends inside i2. --- i2i1: i2 starts before i1, but ends inside i1. --- i1_inside: i1 is fully contained inside i2. --- i2_inside: i2 is fully contained inside i1. 
--- --- Return values: --- 1: true/false --- 2: if true, returns "i1i2", "i2i1", "i1_inside", "i2_inside" ------------------------------------------------------------------------------------------------------------------------- -local function isOverlappingInterval(i1, i2) - if i1.chr ~= i2.chr then return false - elseif i1.start < i2.start and i1.finish < i2.finish then return true, "i1i2" - elseif i2.start < i1.start and i2.finish < i1.finish then return true, "i2i1" - elseif i1.start > i2.start and i1.finish < i2.finish then return true, "i1_inside" - elseif i2.start > i1.start and i2.finish < i1.finish then return true, "i2_inside" - else return false end -end \ No newline at end of file diff --git a/lua/chunkIntervals.lua b/lua/chunkIntervals.lua deleted file mode 100644 index c8ed48aa2..000000000 --- a/lua/chunkIntervals.lua +++ /dev/null @@ -1,21 +0,0 @@ -local infile = arg[1] or io.stdin -local outfile = (arg[2] and io.open(arg[2], "w")) or io.stdout - - -local function cutcommas(x) - local n = "" - for v in x:gmatch("(%d+)") do - n = n .. 
v - end - if n == "" then return tonumber(x) else return tonumber(n) end -end - -for l in io.lines(infile) do - local chr, startPos, endPos = l:match("(.*):(%d+)%-([%d,]+)") - startPos = cutcommas(startPos) - endPos = cutcommas(endPos) - for i=startPos,endPos,1000000 do - if endPos > i+999999 then outfile:write(chr..":"..i.."-"..i+999999 .."\n") - else outfile:write(chr..":"..i.."-"..endPos.."\n") end - end -end \ No newline at end of file diff --git a/lua/findGenes.lua b/lua/findGenes.lua deleted file mode 100644 index fd43cf610..000000000 --- a/lua/findGenes.lua +++ /dev/null @@ -1,298 +0,0 @@ --- This script parses the table of genes from http://geneticassociationdb.nih.gov/ --- and generates an annotated interval list for use with the GATK --- --- Author: carneiro --- Date: 5/24/2011 - - -ref_table = { - "ID", - "AS", - "PH", - "DI", - "DC", - "DT", - "CH", - "CB", - "GE", - "ST", - "SP", - "PV", - "RE", - "PI", - "AA", - "AF", - "PC", - "GN", - "RS", - "PO", - "GO", - "SU", - "LN", - "UN", - "NP", - "MP", - "JO", - "TI", - "RN", - "OM", - "YR", - "CN", - "SI", - "EF", - "GIGA", - "GIAA", - "GIGB", - "GIAB", - "GIGC", - "GIAC", - "GIAS", - "GIEF" -} - -chr_limits = { - 249250621, - 243199373, - 198022430, - 191154276, - 180915260, - 171115067, - 159138663, - 146364022, - 141213431, - 135534747, - 135006516, - 133851895, - 115169878, - 107349540, - 102531392, - 90354753, - 81195210, - 78077248, - 59128983, - 63025520, - 48129895, - 51304566, - 155270560, - 59373566, - 16569 -} - -local header = [[@HD VN:1.0 SO:coordinate -@SQ SN:1 LN:249250621 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1b22b98cdeb4a9304cb5d48026a85128 SP:Homo Sapiens -@SQ SN:2 LN:243199373 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:a0d9851da00400dec1098a9255ac712e SP:Homo Sapiens -@SQ SN:3 LN:198022430 AS:GRCh37 
UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fdfd811849cc2fadebc929bb925902e5 SP:Homo Sapiens -@SQ SN:4 LN:191154276 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:23dccd106897542ad87d2765d28a19a1 SP:Homo Sapiens -@SQ SN:5 LN:180915260 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:0740173db9ffd264d728f32784845cd7 SP:Homo Sapiens -@SQ SN:6 LN:171115067 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1d3a93a248d92a729ee764823acbbc6b SP:Homo Sapiens -@SQ SN:7 LN:159138663 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:618366e953d6aaad97dbe4777c29375e SP:Homo Sapiens -@SQ SN:8 LN:146364022 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:96f514a9929e410c6651697bded59aec SP:Homo Sapiens -@SQ SN:9 LN:141213431 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:3e273117f15e0a400f01055d9f393768 SP:Homo Sapiens -@SQ SN:10 LN:135534747 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:988c28e000e84c26d552359af1ea2e1d SP:Homo Sapiens -@SQ SN:11 LN:135006516 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:98c59049a2df285c76ffb1c6db8f8b96 SP:Homo Sapiens -@SQ SN:12 LN:133851895 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:51851ac0e1a115847ad36449b0015864 SP:Homo Sapiens -@SQ SN:13 LN:115169878 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:283f8d7892baa81b510a015719ca7b0b SP:Homo Sapiens -@SQ SN:14 LN:107349540 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta 
M5:98f3cae32b2a2e9524bc19813927542e SP:Homo Sapiens -@SQ SN:15 LN:102531392 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:e5645a794a8238215b2cd77acb95a078 SP:Homo Sapiens -@SQ SN:16 LN:90354753 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fc9b1a7b42b97a864f56b348b06095e6 SP:Homo Sapiens -@SQ SN:17 LN:81195210 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:351f64d4f4f9ddd45b35336ad97aa6de SP:Homo Sapiens -@SQ SN:18 LN:78077248 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:b15d4b2d29dde9d3e4f93d1d0f2cbc9c SP:Homo Sapiens -@SQ SN:19 LN:59128983 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1aacd71f30db8e561810913e0b72636d SP:Homo Sapiens -@SQ SN:20 LN:63025520 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:0dec9660ec1efaaf33281c0d5ea2560f SP:Homo Sapiens -@SQ SN:21 LN:48129895 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:2979a6085bfe28e3ad6f552f361ed74d SP:Homo Sapiens -@SQ SN:22 LN:51304566 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:a718acaa6135fdca8357d5bfe94211dd SP:Homo Sapiens -@SQ SN:X LN:155270560 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7e0e2e580297b7764e31dbc80c2540dd SP:Homo Sapiens -@SQ SN:Y LN:59373566 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1fa3474750af0948bdf97d5a0ee52e51 SP:Homo Sapiens -@SQ SN:MT LN:16569 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:c68f52674c9fb33aef52dcf399755519 SP:Homo Sapiens -@SQ SN:GL000207.1 LN:4262 AS:GRCh37 
UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:f3814841f1939d3ca19072d9e89f3fd7 SP:Homo Sapiens -@SQ SN:GL000226.1 LN:15008 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1c1b2cd1fccbc0a99b6a447fa24d1504 SP:Homo Sapiens -@SQ SN:GL000229.1 LN:19913 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d0f40ec87de311d8e715b52e4c7062e1 SP:Homo Sapiens -@SQ SN:GL000231.1 LN:27386 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:ba8882ce3a1efa2080e5d29b956568a4 SP:Homo Sapiens -@SQ SN:GL000210.1 LN:27682 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:851106a74238044126131ce2a8e5847c SP:Homo Sapiens -@SQ SN:GL000239.1 LN:33824 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:99795f15702caec4fa1c4e15f8a29c07 SP:Homo Sapiens -@SQ SN:GL000235.1 LN:34474 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:118a25ca210cfbcdfb6c2ebb249f9680 SP:Homo Sapiens -@SQ SN:GL000201.1 LN:36148 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:dfb7e7ec60ffdcb85cb359ea28454ee9 SP:Homo Sapiens -@SQ SN:GL000247.1 LN:36422 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7de00226bb7df1c57276ca6baabafd15 SP:Homo Sapiens -@SQ SN:GL000245.1 LN:36651 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:89bc61960f37d94abf0df2d481ada0ec SP:Homo Sapiens -@SQ SN:GL000197.1 LN:37175 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6f5efdd36643a9b8c8ccad6f2f1edc7b SP:Homo Sapiens -@SQ SN:GL000203.1 LN:37498 AS:GRCh37 
UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:96358c325fe0e70bee73436e8bb14dbd SP:Homo Sapiens -@SQ SN:GL000246.1 LN:38154 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:e4afcd31912af9d9c2546acf1cb23af2 SP:Homo Sapiens -@SQ SN:GL000249.1 LN:38502 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1d78abec37c15fe29a275eb08d5af236 SP:Homo Sapiens -@SQ SN:GL000196.1 LN:38914 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d92206d1bb4c3b4019c43c0875c06dc0 SP:Homo Sapiens -@SQ SN:GL000248.1 LN:39786 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:5a8e43bec9be36c7b49c84d585107776 SP:Homo Sapiens -@SQ SN:GL000244.1 LN:39929 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:0996b4475f353ca98bacb756ac479140 SP:Homo Sapiens -@SQ SN:GL000238.1 LN:39939 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:131b1efc3270cc838686b54e7c34b17b SP:Homo Sapiens -@SQ SN:GL000202.1 LN:40103 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:06cbf126247d89664a4faebad130fe9c SP:Homo Sapiens -@SQ SN:GL000234.1 LN:40531 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:93f998536b61a56fd0ff47322a911d4b SP:Homo Sapiens -@SQ SN:GL000232.1 LN:40652 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:3e06b6741061ad93a8587531307057d8 SP:Homo Sapiens -@SQ SN:GL000206.1 LN:41001 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:43f69e423533e948bfae5ce1d45bd3f1 SP:Homo Sapiens -@SQ SN:GL000240.1 LN:41933 AS:GRCh37 
UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:445a86173da9f237d7bcf41c6cb8cc62 SP:Homo Sapiens -@SQ SN:GL000236.1 LN:41934 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fdcd739913efa1fdc64b6c0cd7016779 SP:Homo Sapiens -@SQ SN:GL000241.1 LN:42152 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:ef4258cdc5a45c206cea8fc3e1d858cf SP:Homo Sapiens -@SQ SN:GL000243.1 LN:43341 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:cc34279a7e353136741c9fce79bc4396 SP:Homo Sapiens -@SQ SN:GL000242.1 LN:43523 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:2f8694fc47576bc81b5fe9e7de0ba49e SP:Homo Sapiens -@SQ SN:GL000230.1 LN:43691 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:b4eb71ee878d3706246b7c1dbef69299 SP:Homo Sapiens -@SQ SN:GL000237.1 LN:45867 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:e0c82e7751df73f4f6d0ed30cdc853c0 SP:Homo Sapiens -@SQ SN:GL000233.1 LN:45941 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7fed60298a8d62ff808b74b6ce820001 SP:Homo Sapiens -@SQ SN:GL000204.1 LN:81310 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:efc49c871536fa8d79cb0a06fa739722 SP:Homo Sapiens -@SQ SN:GL000198.1 LN:90085 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:868e7784040da90d900d2d1b667a1383 SP:Homo Sapiens -@SQ SN:GL000208.1 LN:92689 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:aa81be49bf3fe63a79bdc6a6f279abf6 SP:Homo Sapiens -@SQ SN:GL000191.1 LN:106433 AS:GRCh37 
UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d75b436f50a8214ee9c2a51d30b2c2cc SP:Homo Sapiens -@SQ SN:GL000227.1 LN:128374 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:a4aead23f8053f2655e468bcc6ecdceb SP:Homo Sapiens -@SQ SN:GL000228.1 LN:129120 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:c5a17c97e2c1a0b6a9cc5a6b064b714f SP:Homo Sapiens -@SQ SN:GL000214.1 LN:137718 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:46c2032c37f2ed899eb41c0473319a69 SP:Homo Sapiens -@SQ SN:GL000221.1 LN:155397 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:3238fb74ea87ae857f9c7508d315babb SP:Homo Sapiens -@SQ SN:GL000209.1 LN:159169 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:f40598e2a5a6b26e84a3775e0d1e2c81 SP:Homo Sapiens -@SQ SN:GL000218.1 LN:161147 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1d708b54644c26c7e01c2dad5426d38c SP:Homo Sapiens -@SQ SN:GL000220.1 LN:161802 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fc35de963c57bf7648429e6454f1c9db SP:Homo Sapiens -@SQ SN:GL000213.1 LN:164239 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:9d424fdcc98866650b58f004080a992a SP:Homo Sapiens -@SQ SN:GL000211.1 LN:166566 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7daaa45c66b288847b9b32b964e623d3 SP:Homo Sapiens -@SQ SN:GL000199.1 LN:169874 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:569af3b73522fab4b40995ae4944e78e SP:Homo Sapiens -@SQ SN:GL000217.1 LN:172149 AS:GRCh37 
UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6d243e18dea1945fb7f2517615b8f52e SP:Homo Sapiens -@SQ SN:GL000216.1 LN:172294 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:642a232d91c486ac339263820aef7fe0 SP:Homo Sapiens -@SQ SN:GL000215.1 LN:172545 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:5eb3b418480ae67a997957c909375a73 SP:Homo Sapiens -@SQ SN:GL000205.1 LN:174588 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d22441398d99caf673e9afb9a1908ec5 SP:Homo Sapiens -@SQ SN:GL000219.1 LN:179198 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:f977edd13bac459cb2ed4a5457dba1b3 SP:Homo Sapiens -@SQ SN:GL000224.1 LN:179693 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d5b2fc04f6b41b212a4198a07f450e20 SP:Homo Sapiens -@SQ SN:GL000223.1 LN:180455 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:399dfa03bf32022ab52a846f7ca35b30 SP:Homo Sapiens -@SQ SN:GL000195.1 LN:182896 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:5d9ec007868d517e73543b005ba48535 SP:Homo Sapiens -@SQ SN:GL000212.1 LN:186858 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:563531689f3dbd691331fd6c5730a88b SP:Homo Sapiens -@SQ SN:GL000222.1 LN:186861 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6fe9abac455169f50470f5a6b01d0f59 SP:Homo Sapiens -@SQ SN:GL000200.1 LN:187035 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:75e4c8d17cd4addf3917d1703cacaf25 SP:Homo Sapiens -@SQ SN:GL000193.1 LN:189789 AS:GRCh37 
UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:dbb6e8ece0b5de29da56601613007c2a SP:Homo Sapiens -@SQ SN:GL000194.1 LN:191469 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6ac8f815bf8e845bb3031b73f812c012 SP:Homo Sapiens -@SQ SN:GL000225.1 LN:211173 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:63945c3e6962f28ffd469719a747e73c SP:Homo Sapiens -@SQ SN:GL000192.1 LN:547496 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:325ba9e808f669dfeee210fdd7b470ac SP:Homo Sapiens -@SQ SN:NC_007605 LN:171823 AS:NC_007605.1 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6743bd63b3ff2b5b8985d8933c53290a SP:Epstein-Barr virus]] - -local function comp(a, b) - if a.chr == b.chr then return a.start < b.start end - local x = tonumber(a.chr) - local y = tonumber(b.chr) - if x and y then - return x < y end - return a.chr < b.chr -end - -local function cleanNumber (x) - local t = x:gsub(",","") - return tonumber(t) -end - -local function cleanDisease (x) - local d = x:gsub("[^%w%s]", "") - return d:gsub(" ", "_") -end - -local function isSameGene (g1, g2) - return g1.chr == g2.chr and - g1.start == g2.start and - g1.finish == g2.finish -end - -local function isValidGene(gene) - local x - if gene.chr == "X" then x = 23 - elseif gene.chr == "Y" then x = 24 - elseif gene.chr == "MT" then x = 25 - else x = tonumber(gene.chr) end - - return x >= 1 and x <= 25 and gene.start < gene.finish and chr_limits[x] > gene.finish -end - - -local function addGene(geneTable, gene) - if isValidGene(gene) then -- only adds valid genes - local geneAdded = false - for _, g in ipairs(geneTable) do - if isSameGene(gene, g) then - geneAdded = true - if g.disease ~= gene.disease then - g.disease = g.disease .. 
","..gene.disease - end - end - end - if (not geneAdded) then - table.insert(geneTable, gene) - end - end -end - -local function cleanIntervals(f, t) - for l in io.lines(f) do - local counter = 1 - local id, c, b, e, disease - for match in l:gmatch("(.-);;") do - -- print("DEBUG: ", counter, match) - if counter == 1 then id = match - elseif counter == 3 then disease = cleanDisease(match) - elseif counter == 7 then c = match - elseif counter == 10 then b = cleanNumber(match) - elseif counter == 11 then e = cleanNumber(match) - end - counter = counter + 1 - end - if id and c and b and e and c~= "" then - addGene(t, {id=id, chr=c, start=b, finish=e, disease=disease}) - end - end - return table.sort(t, comp) -end - -local function printIntervals(t) - print(header) - for _,interval in ipairs(t) do - print(interval.chr,interval.start,interval.finish, "+", interval.id..":::"..interval.disease) - end -end - -t = {} -cleanIntervals(arg[1], t) -printIntervals(t) - ---[[ --- REFERENCE TABLE - -1 ID -2 Association(Y/N) -3 Broad Phenotype -4 Disease Class -5 Disease Class Code -6 MeSH Disease Terms -7 Chromosom -8 Chr-Band -9 Gene -10 DNA Start -11 DNA End -12 P Value -13 Reference -14 Pubmed ID -15 Allele Author Description -16 Allele Functional Effects -17 Polymophism Class -18 Gene Name -19 RefSeq -20 Population -21 MeSH Geolocation -22 Submitter -23 Locus Number -24 Unigene -25 Narrow Phenotype -26 Mole. Phenotype -27 Journal -28 Title -29 rs Number -30 OMIM ID -31 Year -32 Conclusion -33 Study Info -34 Env. Factor -35 GI Gene A -36 GI Allele of Gene A -37 GI Gene B -38 GI Allele of Gene B -39 GI Gene C -40 GI Allele of Gene C -41 GI Association? -42 GI combine Env. 
Factor ---]] \ No newline at end of file diff --git a/lua/hasTheseTargets.lua b/lua/hasTheseTargets.lua deleted file mode 100644 index 3a6682de1..000000000 --- a/lua/hasTheseTargets.lua +++ /dev/null @@ -1,143 +0,0 @@ --- This script takes two interval lists (a and b) and creates another interval list (c) with the intervals in a that are --- inside the targets in b. For example, if a is a list of exon targets and b a list of genes, then c would be all exons --- covered by the genes in file b. --- --- Author: carneiro --- Date: 5/26/2011 - -local targetList = arg[1] -- list of targets to check if they are part of the set -local targetSet = arg[2] -- set of targets - - ------------------------------------------------------------------------------------------------------------------------- --- Parses a line from an interval list file (not a header line!) --- --- Return values: --- 1: chromosome --- 2: interval start --- 3: interval end --- 4: strand (+/-) --- 5: info field ------------------------------------------------------------------------------------------------------------------------- -local function parseIntervalLine(l) - return l:match("(%w+)%s+(%d+)%s+(%d+)%s+([%+%-])%s+(.*)") -end - ------------------------------------------------------------------------------------------------------------------------- --- Reads an interval list file into a table --- --- Return values: --- 1: index pairs table with each interval (each interval is a table of the form {c, s, e, strand, info}) --- 2: header string ------------------------------------------------------------------------------------------------------------------------- -local function readIntervals(filename) - t = {} - header = "" - for l in io.lines(filename) do - if l:sub(1,1) == "@" then header = header .. 
l .."\n" - else - local c, s, e, strand, info = parseIntervalLine(l) - table.insert(t, {c=c, s=tonumber(s), e=tonumber(e), strand=strand, info=info}) - end - end - return t, header -end - ------------------------------------------------------------------------------------------------------------------------- --- Checks if two intervals have the same chromosome, start and end. --- --- Return values: --- 1: true/false ------------------------------------------------------------------------------------------------------------------------- -local function isSameInterval(i, c, s, e) - return i.c == c and s == i.s and e == i.e -end - ------------------------------------------------------------------------------------------------------------------------- --- Checks if the line from an interval list file is a header line --- --- Return values: --- 1: true/false ------------------------------------------------------------------------------------------------------------------------- -local function isIntervalHeaderLine(l) - return l:sub(1,1) == "@" -end - - ------------------------------------------------------------------------------------------------------------------------- --- Compares the start position of two intervals --- --- Return values: --- 1: -1, 0 or 1 (respectively for a < b, a == b, a > b) ------------------------------------------------------------------------------------------------------------------------- -local function compIntervals(a, b) - - local function c(x,y) - if x < y then return -1 - elseif x > y then return 1 - else return 0 end - end - -- same chromosomes - if a.c == b.c then return c(a.s, b.s) - else - x = tonumber(a.c) - y = tonumber(b.c) - if x and y then return c(x,y) - else return c(a.c, b.c) end - end -end - ------------------------------------------------------------------------------------------------------------------------- --- Creates a new interval --- --- Return values: --- 1: interval table {c, s, e, st, info} 
------------------------------------------------------------------------------------------------------------------------- -local function newInterval(c,s,e,st,info) - return {c=c, s=tonumber(s), e=tonumber(e), st=st, info=info } -end - -local function printInterval(i) - print(i.c, i.s, i.e, i.st, i.info) -end - -local function intervalContainsInterval(a, b) - return a.c == b.c and a.s <= b.s and a.e >= b.e -end - -local function isInterceptingInterval(a, b) - return a.s < b.s and a.e < b.e and a.e > b.s -end - -local function findInterval(i, intervals) - local start = 1 - local finish = table.getn(intervals) - local current = math.floor((start + finish) / 2) - - while start < finish and not intervalContainsInterval(i, intervals[current]) and not isInterceptingInterval(i, intervals[current]) do - if compIntervals(i, intervals[current]) < 0 then - finish = current - 1 - else - start = current + 1 - end - current = math.floor((start + finish) / 2) - end - return intervalContainsInterval(i, intervals[current]), current -end - - - - -local a, header = readIntervals(targetList) -io.write(header) - -for l in io.lines(targetSet) do - if not isIntervalHeaderLine(l) then - local c, s, e, st, info = parseIntervalLine(l) - local intA = newInterval(c,s,e,st,info) - local intervalExists, i = findInterval(intA, a) - if intervalExists then - print(a[i].c, a[i].s, a[i].e, st, info) - end - end -end diff --git a/lua/mergeIntervals.lua b/lua/mergeIntervals.lua deleted file mode 100644 index 0a505c302..000000000 --- a/lua/mergeIntervals.lua +++ /dev/null @@ -1,71 +0,0 @@ --- Merges all intervals into a sorted list of unoverlapping intervals --- --- ps: this utility is similar to -im on GATK, but for queue scripts we need the intervals --- file fixed before we can run Unified Genotyper because of scatter-gather. 
--- --- Usage: --- lua MergeIntervals.lua raw.intervals > merged.intervals --- - -assert(table.getn(arg) > 0, "\n\nMissing input file\n\nUsage:\n\tlua MergeIntervals.lua raw.intervals > merged.intervals\n\n") - - -local function comp(a, b) - if tonumber(a) and tonumber(b) then return tonumber(a) < tonumber(b) end - return a lastPositionRead do - if (intervalStart < 0 or i ~= (intervalStart + intervalSize[c][intervalStart] + 1)) then - intervalStart = i - intervalSize[c][intervalStart] = 0 - table.insert(intervalsIndex[c], intervalStart) - else - intervalSize[c][intervalStart] = intervalSize[c][intervalStart] + 1 - end - lastPositionRead = i - i = i + 1 - end - end -end - -table.sort(sortedChrs, comp) -for _,c in pairs(sortedChrs) do - table.sort(intervalsIndex[c]) - for _,intervalStart in ipairs(intervalsIndex[c]) do - if intervalSize[c][intervalStart] > 0 then - print(c..":"..intervalStart.."-"..intervalStart + intervalSize[c][intervalStart]) - end - end -end diff --git a/lua/remapAmplicons.lua b/lua/remapAmplicons.lua deleted file mode 100644 index 22bd90404..000000000 --- a/lua/remapAmplicons.lua +++ /dev/null @@ -1,86 +0,0 @@ -local ampFile = arg[1] -local samFile = arg[2] -local headerFile = io.open(arg[2]:match("(.*).sam")..".header.sam", "w") -local bodyFile = io.open(arg[2]:match("(.*).sam")..".body.sam", "w") - - --- These sizes are hardcoded for hg19, but future versions of this script should optionally take a .fai file to build this table. 
-chrlength = {} -chrlength["1"] =249250621 -chrlength["2"] =243199373 -chrlength["3"] =198022430 -chrlength["4"] =191154276 -chrlength["5"] =180915260 -chrlength["6"] =171115067 -chrlength["7"] =159138663 -chrlength["8"] =146364022 -chrlength["9"] =141213431 -chrlength["10"]=135534747 -chrlength["11"]=135006516 -chrlength["12"]=133851895 -chrlength["13"]=115169878 -chrlength["14"]=107349540 -chrlength["15"]=102531392 -chrlength["16"]=90354753 -chrlength["17"]=81195210 -chrlength["18"]=78077248 -chrlength["19"]=59128983 -chrlength["20"]=63025520 -chrlength["21"]=48129895 -chrlength["22"]=51304566 -chrlength["X"] =155270560 -chrlength["Y"] =59373566 -chrlength["MT"]=16569 - -local header = {} -local reads = {} - --- reads the amplicon file (global var) and returns an amplicon hash table indexed by amplicon name -local function readAmpliconFile() - local amplicons = {} - for l in io.lines(ampFile) do - local chr, startPos, endPos, ampName = l:match("([%w]+)%s+(%d+)%s+(%d+)%s+([%w%p_]+)") - amplicons[ampName] = {chr=chr, startPos=tonumber(startPos), endPos=tonumber(endPos)} - end - return amplicons -end - --- updates the global header table with the entries -local function processSamHeaderLine(l, amplicons) - if l:sub(1,3) == "@HD" then header.hd = l - elseif l:sub(1,3) == "@CO" then header.co = l - else - if not header.sq then header.sq = {} end - local ampName = l:match("@SQ%s+SN:ps%d+_([%w%p_]+).*") - local chr = amplicons[ampName].chr - header.sq[chr] = chrlength[chr] - end -end - -local function printHeader() - if header.hd then headerFile:write(header.hd.."\n") else headerFile:write("@HD VN:PB_v0.1") end - for chr, len in pairs(header.sq) do - headerFile:write("@SQ\tSN:"..chr.."\tLN:"..len.."\n") - end - if header.co then headerFile:write(header.co.."\n") end -end - -local function printReads() - for _, v in ipairs(reads) do bodyFile:write(v.."\n") end -end - - -local amplicons = readAmpliconFile() -- amplicons indexed by name - ---for i,v in pairs(amplicons) do 
print("'"..i.."'") end - -for l in io.lines(samFile) do - if l:sub(1,1) == "@" then processSamHeaderLine(l, amplicons) - else - local before, amp, mapStart, after = l:match("(%d+%s+%d+%s+)ps%d+_([%w%p_]+)%s+(%d+)(.*)") - table.insert(reads, before..amplicons[amp].chr.."\t"..amplicons[amp].startPos + mapStart - 1 ..after) - end -end - -printHeader() -printReads() diff --git a/lua/sortGenesByCoverage.lua b/lua/sortGenesByCoverage.lua deleted file mode 100644 index 6860976fe..000000000 --- a/lua/sortGenesByCoverage.lua +++ /dev/null @@ -1,64 +0,0 @@ --- This script takes the interval summary output from GATK's Depth of Coverage and the --- interval list for 'genes of interest' and generates an intervals file sorted by --- coverage. --- --- Author: carneiro --- Date: 5/26/2011 -------------------------------------------------------------------------------- --- Global script variables -------------------------------------------------------------------------------- -local coverage = arg[1] -local genesOfInterest= arg[2] - -local genes = {} -local header = "" - -------------------------------------------------------------------------------- --- Helper functions -------------------------------------------------------------------------------- - -local function isSameGene(i, c, s, e) - return genes[i].c == c and s == genes[i].s and e == genes[i].e -end - -local function isMergedGene(i, c, s, e) - return genes[i].c == c and s <= genes[i].s and e >= genes[i].e -end - -local function comp(a,b) - return a.avgCoverage < b.avgCoverage -end - -for l in io.lines(genesOfInterest) do - if l:sub(1,1) == "@" then header = header .. 
l .."\n" - else - local c, s, e, info = l:match("(%w+)%s+(%d+)%s+(%d+)%s+%+%s+(.*)") - table.insert(genes, {c=c, s=tonumber(s), e=tonumber(e), info=info}) - end -end - -local i = 1 -for l in io.lines(coverage) do - local geneOk = false - if l:match("(%w+)") ~= "Target" then -- skip the first line (header) - local c, s, e, totalCoverage, avgCoverage = l:match("(%w+):(%d+)%-(%d+)%s+(%d+)%s+([%d%.]+)") - s = tonumber(s) - e = tonumber(e) - while genes[i] ~= nil and (isSameGene(i, c, s, e) or isMergedGene(i, c, s, e)) do - genes[i].totalCoverage = tonumber(totalCoverage) - genes[i].avgCoverage = tonumber(avgCoverage) - geneOk = true - i = i + 1 - end - if not geneOk then - error("Warning: Gene mismatch! Crazy!", c,s,e,"--",genes[i].c, genes[i].s, genes[i].e) - break - end - end -end - -table.sort(genes, comp) -io.write(header) -for _, g in ipairs(genes) do - print(g.c,g.s,g.e,"+",g.avgCoverage..":::"..g.totalCoverage..":::"..g.info) -end diff --git a/lua/updateSampleList.lua b/lua/updateSampleList.lua deleted file mode 100644 index 8df42373b..000000000 --- a/lua/updateSampleList.lua +++ /dev/null @@ -1,20 +0,0 @@ ---[[ --- Updates a list of BAM files to the latest version in the picard aggregation path --- Usage: --- --- lua updateSampleList.lua samples.list > updated_samples.list - ]] -function latestVersion(sample) - local version = tonumber(sample:match("/v(%d+)/")) - f = io.open(sample) - while (f == nil) do - version = version + 1 - sample = sample:gsub("/v(%d+)/", "/v"..version.."/") - f = io.open(sample) - end - return(sample) -end - -for sample in io.lines(arg[1]) do - print(latestVersion(sample)) -end diff --git a/matlab/make_cdkn2_power_cvg_plots.m b/matlab/make_cdkn2_power_cvg_plots.m deleted file mode 100755 index 161281a31..000000000 --- a/matlab/make_cdkn2_power_cvg_plots.m +++ /dev/null @@ -1,225 +0,0 @@ -clear all -close all - -need_to_create_metrics_file = false; - -if ( need_to_create_metrics_file ) - production_names = {'FHS_P10_766_CDKN2'; ... 
- 'FHS_P113_140_CDKN2'; ... - 'FHS_P11_767_CDKN2'; ... - 'FHS_P13_769_CDKN2'; ... - 'FHS_P141_168_CDKN2'; ... - 'FHS_P14_770_CDKN2'; ... - 'FHS_P15_771_CDKN2'; ... - 'FHS_P169_196_CDKN2'; ... - 'FHS_P17_773_CDKN2'; ... - 'FHS_P197_224_CDKN2'; ... - 'FHS_P1_757_CDKN2'; ... - 'FHS_P225_252_CDKN2'; ... - 'FHS_P26_782_CDKN2'; ... - 'FHS_P27_783_CDKN2'; ... - 'FHS_P28_784_CDKN2'; ... - 'FHS_P2_758_CDKN2'; ... - 'FHS_P309_336_CDKN2'; ... - 'FHS_P365_392_CDKN2'; ... - 'FHS_P393_420_CDKN2'; ... - 'FHS_P3_759_CDKN2'; ... - 'FHS_P421_448_CDKN2'; ... - 'FHS_P449_476_CDKN2'; ... - 'FHS_P477_504_CDKN2'; ... - 'FHS_P4_760_CDKN2'; ... - 'FHS_P505_532_CDKN2'; ... - 'FHS_P533_560_CDKN2'; ... - 'FHS_P561_588_CDKN2'; ... - 'FHS_P57_84_CDKN2'; ... - 'FHS_P5_761_CDKN2'; ... - 'FHS_P617_644_CDKN2'; ... - 'FHS_P645_672_CDKN2'; ... - 'FHS_P673_700_CDKN2'; ... - 'FHS_P6_762_CDKN2'; ... - 'FHS_P701_728_CDKN2'; ... - 'FHS_P729_756_CDKN2'; ... - 'FHS_P757_784_CDKN2'; ... - 'FHS_P85_112_CDKN2'; ... - 'FHS_P8_764_CDKN2'; ... 
- 'FHS_P9_765_CDKN2';}; - - production_dir = '/humgen/gsa-hphome1/projects/FHS/production/analysis/'; - - - pilot_names = {'CEPH1_CDKN2'; 'CEPH2_CDKN2'; 'CEPH3_CDKN2'}; - pilot_dir = '/humgen/gsa-hphome1/projects/FHS/pilot/analysis/'; - - - power_ext = '_power.txt'; - cvg_ext = '_coverage.txt'; - - - production_cvg = []; - production_power = []; - pilot_cvg = []; - - % load production data - - for ii = 1:length(production_names) - pow_file = strcat(production_dir,production_names{ii},power_ext); - [chrompos, data, u1, u2, u3, u4] = textread(pow_file,'%s\t%f\t%f\t%f\t%f\t%f','headerlines',1); - production_power = [production_power data(:,1)]; - cvg_file = strcat(production_dir,production_names{ii},cvg_ext); - [chrompos,data] = textread(cvg_file,'%s\t%d','headerlines',1); - production_cvg = [production_cvg data(:,1)]; - end - - % load pilot data - - for ii = 1:length(pilot_names) - cvg_file = strcat(pilot_dir,pilot_names{ii},cvg_ext); - [chrompos,data] = textread(cvg_file,'%s\t%d','headerlines',1); - pilot_cvg = [pilot_cvg data(:,1)]; - end - - % grab the raw positions on chromosome 9 - - pos = []; - - for ii = 1:length(chrompos) - g = chrompos{ii}; - h = find(g == ':'); - pos = [pos; str2num(g(h+1:end))]; - end - - % import the amplicon list - - [amp_start, amp_end] = textread('/humgen/gsa-hphome1/projects/FHS/interval_lists/chr9_amplicons.interval_list','%d\t%d'); - - % now make a plottable target_region variable - % and an amplicon_start variable in that same space - target = 0; - target_region = zeros(size(pos)); - amplicon_start = zeros(size(pos)); - prevPos = pos(1) - 1; - - for ii = 1:length(pos) - if ( pos(ii) == prevPos - 1 ) - target_region(ii) = target; - target = target + 1; - else - target_region(ii) = target + 50; - target = target + 51; - end - - if ( any(pos(ii) == amp_start) ) - amplicon_start(ii) = 1; % yes we want this to be boolean - end - - end - - - % now save the data - - 
save('framingham_gene_CDKN2_metrics.mat','pilot_cvg','production_cvg','production_power','amplicon_start','target_region') - - -else - - - load framingham_gene_CDKN2_metrics.mat - - % calculate median and mean - mean_power = mean(production_power,2); - mean_production_cvg = mean(production_cvg,2); - mean_pilot_cvg = mean(pilot_cvg,2); - median_power = median(production_power,2); - median_production_cvg = median(production_cvg,2); - median_pilot_cvg = median(pilot_cvg,2); - - % compute quartiles - - power_quartile1 = zeros(size(production_power(:,1))); - power_quartile3 = zeros(size(power_quartile1)); - depth_prod_quartile1 = zeros(size(power_quartile1)); - depth_pilot_quartile1 = zeros(size(power_quartile1)); - depth_prod_quartile3 = zeros(size(power_quartile1)); - depth_pilot_quartile3 = zeros(size(power_quartile1)); - - for ii = 1 : length(power_quartile1) - power_quartile1(ii) = median(production_power(ii,find(production_power(ii,:) < median_power(ii)))')'; - power_quartile3(ii) = median(production_power(ii,find(production_power(ii,:) > median_power(ii)))')'; - depth_prod_quartile1(ii) = median(production_cvg(ii,find(production_cvg(ii,:) < median_production_cvg(ii)))')'; - depth_prod_quartile3(ii) = median(production_cvg(ii,find(production_cvg(ii,:) > median_production_cvg(ii)))')'; - depth_pilot_quartile1(ii) = median(pilot_cvg(ii,find(pilot_cvg(ii,:) < median_pilot_cvg(ii)))')'; - depth_pilot_quartile3(ii) = median(pilot_cvg(ii,find(pilot_cvg(ii,:) > median_pilot_cvg(ii)))')'; - end - - % take things into log space - - log_mean_power = log(1+mean_power); - log_mean_production_cvg = log(1+mean_production_cvg); - log_mean_pilot_cvg = log(1+mean_pilot_cvg); - log_median_power = log(1+median_power); - log_median_production_cvg = log(1+median_production_cvg); - log_median_pilot_cvg = log(1+median_pilot_cvg); - log_q1_power = log(1+power_quartile1); - log_q3_power = log(1+power_quartile3); - log_q1_production_cvg = log(1+depth_prod_quartile1); - 
log_q3_production_cvg = log(1+depth_prod_quartile3); - log_q1_pilot_cvg = log(1+depth_pilot_quartile1); - log_q3_pilot_cvg = log(1+depth_pilot_quartile3); - - % get amplicon start positions - - amp_start = target_region(find(amplicon_start==1)); - - % make plots - - grey = [0.7,0.7,0.7]; - - h0 = figure; - plot(target_region,mean_power,'r'), hold on - plot(target_region,median_power,'k'), hold on - plot(target_region,power_quartile1, 'color', grey), hold on - plot(target_region, power_quartile3, 'color', grey), hold off - set(gca,'xtick', amp_start) - title('Power - Production') - - h1 = figure; - plot(target_region, mean_pilot_cvg, 'r'), hold on - plot(target_region, median_pilot_cvg, 'k'), hold on - plot(target_region, depth_pilot_quartile1, 'color', grey), hold on - plot(target_region, depth_pilot_quartile3, 'color', grey), hold off - set(gca,'xtick', amp_start) - title('Coverage - Pilot') - - h2 = figure; - plot(target_region, mean_production_cvg, 'r'), hold on - plot(target_region, median_production_cvg, 'k'), hold on - plot(target_region, depth_prod_quartile1, 'color', grey), hold on - plot(target_region, depth_prod_quartile3, 'color', grey), hold off - set(gca,'xtick', amp_start) - title('Coverage - Production') - - h3 = figure; - plot(target_region,log_mean_power,'r'), hold on - plot(target_region,log_median_power,'k'), hold on - plot(target_region,log_q1_power, 'color', grey), hold on - plot(target_region, log_q3_power, 'color', grey), hold off - set(gca,'xtick', amp_start) - title('Log power - Production') - - h4 = figure; - plot(target_region, log_mean_pilot_cvg, 'r'), hold on - plot(target_region, log_median_pilot_cvg, 'k'), hold on - plot(target_region, log_q1_pilot_cvg, 'color', grey), hold on - plot(target_region, log_q3_pilot_cvg, 'color', grey), hold off - set(gca,'xtick', amp_start) - title('Log coverage - Pilot') - - h5 = figure; - plot(target_region, log_mean_production_cvg, 'r'), hold on - plot(target_region, log_median_production_cvg, 'k'), 
hold on - plot(target_region, log_q1_production_cvg, 'color', grey), hold on - plot(target_region, log_q3_production_cvg, 'color', grey), hold off - set(gca,'xtick', amp_start) - title('Log coverage - Production') - -end \ No newline at end of file diff --git a/matlab/make_gene_coverage_box_plot.m b/matlab/make_gene_coverage_box_plot.m deleted file mode 100755 index e626442d6..000000000 --- a/matlab/make_gene_coverage_box_plot.m +++ /dev/null @@ -1,69 +0,0 @@ -close all -%load mean_coverage_per_exon_per_pool.mat <-- doesn't work from -%/sting/matlab -- but this does - -imp = importdata('/humgen/gsa-hphome1/projects/FHS/results/production_averageDoCByExonAndGene.txt'); -data = imp.data; -textdata = imp.textdata; - -pilot_pos = ismember(textdata(:,2),'CEPH3')+ismember(textdata(:,2),'CEPH2')+ismember(textdata(:,2),'CEPH1'); -pilot_data = data( find(pilot_pos == 1), : ); -prod_data = data( find(pilot_pos == 0) , :); -pilot_text = textdata( find ( pilot_pos == 1) , : ); -prod_text = textdata( find ( pilot_pos == 0 ), : ); - -lim = length(data); - -% get unique names - -gnames = [textdata(1,3)]; -prevn = 'hello doofus'; -for ii = 2 : lim/23 % first 1/23 of the file contains all the gene names (rest are repeats) - n = textdata{ii,3}; - if ( length(prevn) == length(n) && prevn(1)==n(1) && prevn(end)==n(end) ) - % quick test to check if gene is same as previous - else - if ( ~ ismember(gnames,n) ) - gnames = [gnames; textdata(ii,3)]; - % more exhaustive test to see if gene name is novel - end - prevn = n; - end -end - -% plot the groups -num_genes = size(gnames) % yes we want to print this - -to_plot = 40; -plotno = 0; -filenamebaseprod = 'FHS_production_gene_cvg_boxplot_'; -filenamebasepilot = 'FHS_pilot_gene_cvg_boxplot_'; -for genes = 1 : to_plot : num_genes - plotno = plotno + 1; - pilot_positions = []; - prod_positions = []; - for g = genes:genes+to_plot - %n = gnames{g,1} - if ( g < length(gnames) ) - pilot_positions = [pilot_positions; 
find(ismember(pilot_text(:,3),gnames{g,1}) == 1)]; - prod_positions = [prod_positions; find(ismember(prod_text(:,3),gnames{g,1}) == 1 )]; - end - end - depths_prod = prod_data(prod_positions,2); - depths_pilot = pilot_data(pilot_positions,2); - grenes_prod = prod_text(prod_positions,3); - grenes_pilot = pilot_text(pilot_positions,3); - h = figure - hp = subplot(1,1,1) - boxplot(depths_prod,grenes_prod,'plotstyle','compact','orientation','vertical','datalim',[0,10000],'whisker',0,'symbol','r+'), title('Production') - set(hp,'YLim',[-1000,11000]) - y = figure - yp = subplot(1,1,1) - boxplot(depths_pilot,grenes_pilot,'plotstyle','compact','orientation','vertical','datalim',[0,10000],'whisker',0,'symbol','r+'), title('Pilot') - set(yp,'YLim',[-1000,11000]) - - % -- uncomment these lines to save the files -- - %saveas(h,strcat(filenamebaseprod,num2str(plotno)),'psc2'); - %saveas(y,strcat(filenamebasepilot,num2str(plotno)),'psc2'); -end - diff --git a/perl/AnnotateVCFwithMAF.pl b/perl/AnnotateVCFwithMAF.pl deleted file mode 100755 index 89dda2de0..000000000 --- a/perl/AnnotateVCFwithMAF.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Data::Dumper; -use File::Basename; - -sub usage { - print "Usage: $0 \n"; - print "This program takes an annotated MAF file and propagates the annotations to the VCF file.\n"; - exit(1); -} - -my %args; -($args{'VCF_IN'}, $args{'MAF_IN'}) = @ARGV; - -if (!defined($args{'VCF_IN'}) || !defined($args{'MAF_IN'})) { - &usage(); -} - -($args{'VCF_OUT'} = basename($args{'VCF_IN'})) =~ s/\.vcf/.maf_annotated.vcf/; - -my %ignoreEntries = ( - 'normal_barcode' => 1, - 'tumor_barcode' => 1, - 'build' => 1, - 'tum_allele1' => 1, - 'ref_allele' => 1, - 'tum_allele2' => 1, - 'start' => 1, - 'end' => 1, -); - -my %maf; -open(MAF_IN, $args{'MAF_IN'}); - -chomp(my $mafheader = ); -chomp($mafheader); -my @mafheader = split(/\s+/, $mafheader); - -while (my $mafline = ) { - chomp($mafline); - my @mafline = split(/\s+/, $mafline); - - 
my %mafentry; - for (my $i = 0; $i <= $#mafheader; $i++) { - $mafentry{$mafheader[$i]} = $mafline[$i]; - } - - my $locus = "$mafentry{'chr'}:$mafentry{'start'}"; - if (exists($mafentry{$locus})) { - print "Locus $locus already spoken for!\n"; - exit(1); - } - - $maf{$locus} = \%mafentry; -} -close(MAF_IN); - -open(VCF_OUT, ">$args{'VCF_OUT'}"); -open(VCF_IN, $args{'VCF_IN'}); - -my @vcfheader; -while (my $vcfline = ) { - chomp($vcfline); - - if ($vcfline =~ /##/) { - print VCF_OUT "$vcfline\n"; - } elsif ($vcfline =~ /#CHROM/) { - - print VCF_OUT "##source=AnnotateVCFwithMAF\n"; - print VCF_OUT "##INFO=\n"; - print VCF_OUT "##INFO=\n"; - print VCF_OUT "##INFO=\n"; - print VCF_OUT "##INFO=\n"; - print VCF_OUT "##INFO=\n"; - print VCF_OUT "##INFO=\n"; - print VCF_OUT "##INFO=\n"; - print VCF_OUT "##INFO=\n"; - print VCF_OUT "##INFO=\n"; - print VCF_OUT "$vcfline\n"; - - $vcfline =~ s/#//g; - @vcfheader = split(/\s+/, $vcfline); - } else { - my @vcfentry = split(/\s+/, $vcfline); - my %vcfentry; - - for (my $i = 0; $i <= $#vcfheader; $i++) { - $vcfentry{$vcfheader[$i]} = $vcfentry[$i]; - } - - my $locus = "$vcfentry{'CHROM'}:$vcfentry{'POS'}"; - if (!exists($maf{$locus})) { - for (my $i = 1; $i <= 10; $i++) { - $locus = "$vcfentry{'CHROM'}:" . 
($vcfentry{'POS'}-$i); - - last if (exists($maf{$locus})); - } - } - my %mafentry = %{$maf{$locus}}; - - my @info = split(/;/, $vcfentry{'INFO'}); - my %info; - foreach my $info (@info) { - my ($key, $value) = split(/=/, $info); - - $info{$key} = $value; - } - - foreach my $mafkey (sort { $a cmp $b } keys(%mafentry)) { - if (!$ignoreEntries{$mafkey}) { - $info{$mafkey} = $mafentry{$mafkey}; - } - } - - my @newinfo; - foreach my $infokey (sort { $a cmp $b } keys(%info)) { - if (!defined($info{$infokey})) { - #print "$infokey is missing\n"; - } else { - push(@newinfo, "$infokey=$info{$infokey}"); - } - } - $vcfentry{'INFO'} = join(";", @newinfo); - - my @newvcfline; - for (my $i = 0; $i <= $#vcfheader; $i++) { - push(@newvcfline, $vcfentry{$vcfheader[$i]}); - } - - print VCF_OUT join("\t", @newvcfline) . "\n"; - } -} - -close(VCF_IN); -close(VCF_OUT); diff --git a/perl/DataTable.pm b/perl/DataTable.pm deleted file mode 100644 index 136cb3920..000000000 --- a/perl/DataTable.pm +++ /dev/null @@ -1,64 +0,0 @@ -package DataTable; - -use strict; -use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); -require Exporter; - -@ISA = qw(Exporter AutoLoader); - -@EXPORT = qw( readTable ); - -use Data::Dumper; - -sub readTable { - my ($file, %overrides) = @_; - my %options = ( - 'file' => $file, - 'key' => undef, - 'header' => 0, - 'delimiter' => '\s+', - 'filter' => '#', - %overrides, - ); - - open(FILE, $options{'file'}); - - my @header = undef; - if ($options{'header'} == 1) { - chomp(my $headerline = ); - $headerline =~ s/#//g; - @header = split(/$options{'delimiter'}/, $headerline); - } elsif (ref($options{'header'}) eq 'ARRAY') { - @header = @{$options{'header'}}; - } - - chomp(my @lines = grep { $_ !~ /$options{'filter'}/ } ); - - my @table; - my %table; - for (my $lineIndex = 0; $lineIndex <= $#lines; $lineIndex++) { - my $line = $lines[$lineIndex]; - - my %fieldHash; - my @fields = split(/$options{'delimiter'}/, $line); - - for (my $fieldIndex = 0; $fieldIndex <= $#fields; 
$fieldIndex++) { - $fieldHash{$header[$fieldIndex]} = $fields[$fieldIndex]; - } - - $fieldHash{'_linenum'} = $lineIndex; - $fieldHash{'_line'} = $line; - - if (!defined($options{'key'})) { - push(@table, \%fieldHash); - } else { - my $key = ($options{'key'} =~ /^\d+/) ? $fields[$options{'key'}] : $fieldHash{$options{'key'}}; - - push(@{$table{$key}}, \%fieldHash); - } - } - - return (!defined($options{'key'})) ? @table : %table; -} - -1; diff --git a/perl/DistributedMake.pm b/perl/DistributedMake.pm deleted file mode 100644 index dec290101..000000000 --- a/perl/DistributedMake.pm +++ /dev/null @@ -1,177 +0,0 @@ -package DistributedMake; - -use strict; -use File::Temp qw/ tempfile tempdir /; -use File::Basename; - -sub parseHostsString { - my ($hoststring) = @_; - - if ($hoststring !~ /\s+\+\s+/) { - return undef; - } - - my @hostobjs = split(/\s+\+\s+/, $hoststring); - - my @hosts; - foreach my $hostobj (@hostobjs) { - my ($multiplier, $server) = $hostobj =~ /(\d+)\*(\w+)/; - for (my $i = 0; $i < $multiplier; $i++) { - push(@hosts, $server); - } - } - - return \@hosts; -} - -sub new { - my ($class, %args) = @_; - - my %self = ( - 'dryRun' => 1, - 'numJobs' => undef, - 'keepGoing' => 0, - 'alwaysMake' => 0, - 'debugging' => 0, - 'ignoreErrors' => 0, - 'printDirectory' => 0, - 'unlink' => 1, - 'hosts' => "", - - 'queue' => undef, - 'memLimit' => 2, - 'outputFile' => 'distributedmake.log', - 'mailTo' => 'crd-lsf@broad.mit.edu', - 'wait' => 1, - 'rerunnable' => 0, - 'migrationThreshold' => undef, - 'extra' => '', - - 'target' => 'all', - - %args, - - 'targets' => [], - 'hostindex' => 0, - ); - - $self{'makefile'} = new File::Temp(TEMPLATE => "/tmp/DistributedMake_XXXXXX", SUFFIX => ".makefile", UNLINK => $self{'unlink'}), - $self{'hostarray'} = &parseHostsString($self{'hosts'}); - $self{'projectName'} = basename($self{'makefile'}); - - bless \%self, $class; - - return \%self; -} - -sub addRule { - my ($self, $targetsref, $dependenciesref, $cmdsref, %batchjoboverrides) 
= @_; - my @targets = (ref($targetsref) eq 'ARRAY') ? @$targetsref : ( $targetsref ); - my @dependencies = (ref($dependenciesref) eq 'ARRAY') ? @$dependenciesref : ( $dependenciesref ); - my @cmds = (ref($cmdsref) eq 'ARRAY') ? @$cmdsref : ( $cmdsref ); - - my @prepcmds; - - my $cmdprefix = ""; - if (defined($self->{'hostarray'})) { - $cmdprefix = "ssh ${$self->{'hostarray'}}[$self->{'hostindex'}] "; - - $self->{'hostindex'}++; - if ($self->{'hostindex'} == scalar(@{$self->{'hostarray'}}) - 1) { - $self->{'hostindex'} = 0; - } - } elsif ((defined($self->{'queue'}) && $self->{'queue'} ne '' && (exists($batchjoboverrides{'queue'}) ? defined($batchjoboverrides{'queue'}) && $batchjoboverrides{'queue'} ne '' : 1)) || (exists($batchjoboverrides{'queue'}) && defined($batchjoboverrides{'queue'}) && $batchjoboverrides{'queue'} ne '')) { - my %bja = ( - 'queue' => $self->{'queue'}, - 'memLimit' => $self->{'memLimit'}, - 'projectName' => $self->{'projectName'}, - 'outputFile' => $self->{'outputFile'}, - 'mailTo' => $self->{'mailTo'}, - 'wait' => $self->{'wait'}, - 'rerunnable' => $self->{'rerunnable'}, - 'migrationThreshold' => $self->{'migrationThreshold'}, - 'extra' => $self->{'extra'}, - %batchjoboverrides, - ); - - my $rerunnable = $bja{'rerunnable'} ? "-r" : ""; - my $migrationThreshold = $bja{'rerunnable'} && defined($bja{'migrationThreshold'}) ? "-mig $bja{'migrationThreshold'}" : ""; - my $wait = $bja{'wait'} ? "-K" : ""; - - my $logdir = dirname($bja{'outputFile'}); - if (!-e $logdir) { - my $mklogdircmd = "\@test \"! 
-d $logdir\" && mkdir -p $logdir"; - push(@prepcmds, $mklogdircmd); - } - - my $memRequest = $bja{'memLimit'} * 1.5; - my $integerMemRequest = int($memRequest); - my $memCutoff = $bja{'memLimit'} * 1024 * 1024 * 1.25; - - # A quick check to make sure that java commands being dispatched to the farm are instructed to run under a default memory limit - for (my $i = 0; $i <= $#cmds; $i++) { - if ($cmds[$i] =~ /^java / && $cmds[$i] =~ / -jar / && $cmds[$i] !~ / -Xmx/) { - $cmds[$i] =~ s/^java /java -Xmx$bja{'memLimit'}g /; - } - } - - $cmdprefix = "bsub -q $bja{'queue'} -M $memCutoff -P $bja{'projectName'} -o $bja{'outputFile'} -u $bja{'mailTo'} -R \"rusage[mem=$integerMemRequest]\" $wait $rerunnable $migrationThreshold $bja{'extra'} "; - } - - my $rootdir = dirname($targets[0]); - if (!-e $rootdir) { - my $mkdircmd = "\@test \"! -d $rootdir\" && mkdir -p $rootdir"; - push(@prepcmds, $mkdircmd); - } - - # We have to touch the final file just in case the time between different nodes on the farm are not synchronized. - print { $self->{'makefile'} } "$targets[0]: " . join(" ", @dependencies) . "\n\t" . join("\n\t", @prepcmds) . "\n\t$cmdprefix" . join("\n\t$cmdprefix", @cmds) . "\n\ttouch -c $targets[0]\n\n\n"; - - push(@{$self->{'targets'}}, $targets[0]); -} - -sub execute { - my ($self, %overrides) = @_; - - print { $self->{'makefile'} } "all: " . join(" ", @{$self->{'targets'}}) . 
"\n\n"; - print { $self->{'makefile'} } ".DELETE_ON_ERROR:\n"; - - my %makeargs = ( - 'dryRun' => $self->{'dryRun'}, - 'numJobs' => $self->{'numJobs'}, - 'keepGoing' => $self->{'keepGoing'}, - 'alwaysMake' => $self->{'alwaysMake'}, - 'debugging' => $self->{'debugging'}, - 'ignoreErrors' => $self->{'ignoreErrors'}, - 'printDirectory' => $self->{'printDirectory'}, - 'target' => $self->{'target'}, - %overrides, - ); - - my $numjobs = $makeargs{'numJobs'}; - if (!defined($numjobs)) { - if (defined($self->{'hostarray'}) && scalar($self->{'hostarray'}) > 0) { - $numjobs = scalar(@{$self->{'hostarray'}}); - } else { - $numjobs = 1; - } - } - - my $makecmd = "make" . - ($makeargs{'dryRun'} ? " -n" : "") . - ($makeargs{'keepGoing'} ? " -k" : "") . - ($makeargs{'alwaysMake'} ? " -B" : "") . - ($makeargs{'ignoreErrors'} ? " -i" : "") . - ($makeargs{'printDirectory'} ? " -w" : "") . - ($makeargs{'debugging'} =~ /[abvijm]+/ ? " --debug=$makeargs{'debugging'}" : "") . - ($makeargs{'debugging'} =~ /\d+/ && $makeargs{'debugging'} == 1 ? " -d" : "") . - " -j $numjobs" . - " -f " . $self->{'makefile'}->filename . 
- " $makeargs{'target'}"; - - print "$makecmd\n"; - system($makecmd); - print "$makecmd\n"; -} - -1; diff --git a/perl/MergeAndEvaluateVCFs.pl b/perl/MergeAndEvaluateVCFs.pl deleted file mode 100755 index e5ad4cf6d..000000000 --- a/perl/MergeAndEvaluateVCFs.pl +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; - -sub getArgs { - my %doc = ( - 'GATK' => "Path to GenomeAnalysisTK.jar (usually in your Sting/dist/ directory)", - 'VARIANTREPORT' => "Path to VariantReport.R (usually in your Sting/R/VariantReport/ directory)", - 'REFERENCE' => "Path to human reference sequence", - 'DBSNP' => "Path to dbsnp rod", - 'VCF1' => "Path to first VCF file", - 'VCF1NAME' => "Name of first VCF file", - 'VCF2' => "Path to second VCF file", - 'VCF2NAME' => "Name of second VCF file", - 'VCFOUT' => "Output root for resultant merged VCF, the eval file, and PDF", - ); - - my %args = ( - 'GATK' => undef, - 'VARIANTREPORT' => undef, - 'REFERENCE' => undef, - 'DBSNP' => undef, - 'VCF1' => undef, - 'VCF1NAME' => undef, - 'VCF2' => undef, - 'VCF2NAME' => undef, - 'VCFOUT' => undef, - ); - - my $result = GetOptions( - "GATK=s" => \$args{'GATK'}, - "VARIANTREPORT=s" => \$args{'VARIANTREPORT'}, - "REFERENCE=s" => \$args{'REFERENCE'}, - "DBSNP=s" => \$args{'DBSNP'}, - "VCF1=s" => \$args{'VCF1'}, - "VCF1NAME=s" => \$args{'VCF1NAME'}, - "VCF2=s" => \$args{'VCF2'}, - "VCF2NAME=s" => \$args{'VCF2NAME'}, - "VCFOUT=s" => \$args{'VCFOUT'}, - ); - - my @undefinedArgs; - foreach my $key (sort { $a cmp $b } keys(%args)) { - if (!defined($args{$key})) { - push(@undefinedArgs, $key); - } - } - - if (scalar(@undefinedArgs) > 0) { - print "Error: there were some undefined arguments:\n"; - - foreach my $undefinedArg (@undefinedArgs) { - print "\t-$undefinedArg : $doc{$undefinedArg}\n"; - } - - exit(-1); - } - - return %args; -} - -sub runCommand { - my ($output, $cmd) = @_; - - #if (!-e $output) { - system($cmd); - #} -} - -my %args = &getArgs(); - -my $combineVariantsCmd = 
"java -jar $args{'GATK'} -T CombineVariants -R $args{'REFERENCE'} -B:$args{'VCF1NAME'},VCF $args{'VCF1'} -B:$args{'VCF2NAME'},VCF $args{'VCF2'} -variantMergeOptions UNION -priority $args{'VCF1NAME'},$args{'VCF2NAME'} -o $args{'VCFOUT'} -l INFO"; -&runCommand($args{'VCFOUT'}, $combineVariantsCmd); - -my $variantEvalCmd = "java -jar $args{'GATK'} -T VariantEval -R $args{'REFERENCE'} -D $args{'DBSNP'} -B:eval,VCF $args{'VCFOUT'} -select 'set==\"Intersection\"' -selectName Intersection -select 'set==\"$args{'VCF1NAME'}\"' -selectName $args{'VCF1NAME'} -select 'set==\"filterIn$args{'VCF1NAME'}-$args{'VCF2NAME'}\"' -selectName In$args{'VCF2NAME'}-FilteredIn$args{'VCF1NAME'} -select 'set==\"$args{'VCF2NAME'}\"' -selectName $args{'VCF2NAME'} -select 'set==\"filterIn$args{'VCF2NAME'}-$args{'VCF1NAME'}\"' -selectName In$args{'VCF1NAME'}-FilteredIn$args{'VCF2NAME'} -select 'set==\"FilteredInAll\"' -selectName FilteredInAll -reportType R -reportLocation $args{'VCFOUT'}.eval"; -&runCommand("$args{'VCFOUT'}.eval", $variantEvalCmd); - -my $variantReportCmd = "Rscript $args{'VARIANTREPORT'} -title 'Automated Variant Report' -author '$ENV{'USER'}' -evalRoot $args{'VCFOUT'}.eval -plotOut $args{'VCFOUT'}.eval.pdf"; -&runCommand("$args{'VCFOUT'}.eval.pdf", $variantReportCmd); diff --git a/perl/StingArgs.pm b/perl/StingArgs.pm deleted file mode 100644 index 9dd732e07..000000000 --- a/perl/StingArgs.pm +++ /dev/null @@ -1,305 +0,0 @@ -package StingArgs; - -use strict; -use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); -use Term::ANSIColor qw(:constants); - -require Exporter; - -@ISA = qw(Exporter AutoLoader); - -@EXPORT = qw( getCommandArguments printCommandArguments moduleArguments PrintCommandHeader ); - -sub _getFormattingCharacterMap { - my %fcmap = ( - 'section' => '', - 'arg' => '', - 'default' => '', - 'end' => '' - ); - - if (defined($ENV{'ARACHNE_PRETTY_HELP'})) { - if ($ENV{'ARACHNE_PRETTY_HELP'} eq 'Color') { - $fcmap{'section'} = (RED . BOLD); - $fcmap{'arg'} = (MAGENTA . 
BOLD); - $fcmap{'default'} = BLUE; - $fcmap{'end'} = RESET; - } elsif ($ENV{'ARACHNE_PRETTY_HELP'} eq 'Bold') { - $fcmap{'section'} = BOLD; - $fcmap{'arg'} = BOLD; - $fcmap{'arg'} = BOLD; - $fcmap{'default'} = ""; - $fcmap{'end'} = RESET; - } - } - - return %fcmap; -} - -sub _usage { - my ($requiredArgsRef, $helpRef) = @_; - my %requiredArgs = %$requiredArgsRef; - my %help = (defined($helpRef)) ? %$helpRef : (); - my %optionalArgs; - my %fcmap = &_getFormattingCharacterMap(); - - print "\n$fcmap{'section'}Usage: $0 arg1=value1 arg2=value2 ...$fcmap{'end'}\n\n"; - - print "$fcmap{'section'}Required arguments:$fcmap{'end'}\n\n"; - - foreach my $key (sort { $a cmp $b } keys(%requiredArgs)) { - next if ($key =~ /_postprocess/ || $key =~ /_preprocess/); - if (defined($requiredArgs{$key})) { $optionalArgs{$key} = $requiredArgs{$key}; } - else { - print "$fcmap{'arg'}$key$fcmap{'end'}\n"; - - if (defined($help{$key})) { - print " $help{$key}\n"; - } - } - } - print "\n"; - - return unless keys(%optionalArgs); - - print "$fcmap{'section'}Optional arguments:$fcmap{'end'}\n\n"; - - foreach my $key (sort { $a cmp $b } keys(%optionalArgs)) { - if (defined($requiredArgs{$key})) { - print "$fcmap{'arg'}$key$fcmap{'end'} $fcmap{'default'}default: " . ((ref($requiredArgs{$key}) eq 'ARRAY') ? "\"{" . join(",", @{$requiredArgs{$key}}) . "}\"" : $requiredArgs{$key}) . "$fcmap{'end'}\n"; - - if (defined($help{$key})) { - print " $help{$key}\n"; - } - } - } - print "\n"; -} - -# Parse the command-line arguments in Arachne style (including the ability to have whitespace between an equal sign and the parameter. 
-sub getCommandArguments { - my %requiredArgs = @_; - my %help; - - # Clean up our required arguments - foreach my $key (keys(%requiredArgs)) { - if (ref($requiredArgs{$key}) eq 'HASH') { - $help{$key} = ${$requiredArgs{$key}}{'help'}; - $requiredArgs{$key} = ${$requiredArgs{$key}}{'value'}; - } - if (defined($requiredArgs{$key})) { - $requiredArgs{$key} =~ s/[\r\n]//g; - } - } - - # Set our required argument defaults. - my %args = ( - 'NO_HEADER' => 0, - %requiredArgs, - ); - - if (defined($requiredArgs{'NH'}) && $requiredArgs{'NH'} =~ /(True|true|1)/) { - $args{'NO_HEADER'} = 1; - delete($args{'NH'}); - delete($requiredArgs{'NH'}); - } - - # Print usage and exit if we're not supplied with any arguments. - if ($#ARGV == -1) { - if (defined($requiredArgs{'_usage'})) { &{$requiredArgs{'_usage'}}(\%requiredArgs, \%help); } - else { &_usage(\%requiredArgs, \%help); } - exit(-1); - } - - # Clean up the command-line arguments so that we can accept arguments with spaces and things like 'KEY= VALUE' in addition to the normal 'KEY=VALUE'. - for (my $i = 0; $i <= $#ARGV; $i++) { - my $arg = $ARGV[$i]; - if ($arg =~ /\w+=$/) { - until (($i+1) > $#ARGV || $ARGV[$i+1] =~ /\w+=/) { $arg .= $ARGV[++$i]; } - } - - if ($arg =~ /(NO_HEADER|NH)=(\s)?(True|true|1)/) { # Turn off automatic banner - $args{'NO_HEADER'} = 1; - } elsif ($arg =~ /(.+)=(.+)/) { - my ($key, $value) = ($1, $2); - - # Store arguments that are of no interest to us in a separate variable. This makes it convenient to allow certain arguments to pass through to another script by simply appending this extra argument to its command-line. 
- if (!exists($requiredArgs{$key})) { - $args{'_extra'} .= " $key=\"$value\""; - } - - if ($value eq 'True' || $value eq 'true' || ($value =~ /^\d+$/ && $value == 1)) { $args{$key} = 1; } # Parse boolean values - elsif ($value eq 'False' || $value eq 'false' || ($value =~ /^\d+$/ && $value == 0)) { $args{$key} = 0; } - elsif ($value =~ /{(.+)}/) { # Parse array values - $value = $1; - $value =~ s/\s+//g; - my @values = split(",", $value); - $args{$key} = \@values; - } else { # Parse a regular ol' KEY=VALUE pair - $args{$key} = $value; - } - } elsif ($arg =~ /(.+)=$/) { # Parse a KEY=VALUE pair where VALUE is empty - $args{$1} = ""; - } elsif ($arg =~ /-(h|help|\?)/) { # Print help - if (defined($requiredArgs{'_usage'})) { &{$requiredArgs{'_usage'}}(\%requiredArgs, \%help); } - else { &_usage(\%requiredArgs, \%help); } - - exit(-1); - } - } - - # Pre-process arguments - if (defined($requiredArgs{'_preprocess'})) { - &{$requiredArgs{'_preprocess'}}(\%args); - delete($args{'_preprocess'}); - } - - # Print the header box with info about the command - &PrintCommandHeader() unless ($args{'NO_HEADER'}); - - # Did the user forget any arguments? - my $missingArgs = 0; - foreach my $requiredArg (keys(%requiredArgs)) { - if ($requiredArg !~ /_(pre|post)process/ && !defined($args{$requiredArg})) { - print "$requiredArg must be supplied.\n"; - $missingArgs = 1; - } - } - - if ($missingArgs) { die ("Error: some required arguments were not supplied.\n"); } - - # Post-process arguments - if (defined($requiredArgs{'_postprocess'})) { - &{$requiredArgs{'_postprocess'}}(\%args); - delete($args{'_postprocess'}); - } - - # We're all good! - return %args; -} - -# Print our command arguments -sub printCommandArguments { - my ($argsref) = @_; - my %args = %$argsref; - - foreach my $key (sort { $a cmp $b } keys(%args)) { - if (ref($args{$key}) eq 'ARRAY') { - print "$key => {" . join(",", @{$args{$key}}) . 
"}\n"; - } else { - print "$key => $args{$key}\n"; - } - } -} - - - -# Returns a hash (by reference) with the required command-line arguments -# for the named C++ module. -# The trick is to run the module with no arguments, prompting the usage message -# (defined in system/ParsedArgs.cc), and then capture and parse this message. -sub moduleArguments($) { - my ($module_name) = @_; - my %args; - my ($key, $value); - - # Temporarily setenv ARACHNE_PRETTY_HELP to "Bold" - to help with parsing - my $temp = $ENV{'ARACHNE_PRETTY_HELP'}; - $ENV{'ARACHNE_PRETTY_HELP'} = "Bold"; - - # Escape character - appears in output when ARACHNE_PRETTY_HELP="Bold" - my $esc = chr(27); - my $optional = 0; - - open (FH, "$module_name |"); - - # Process each line of output into a command-line argument - foreach my $line () { - - $optional = 1 if ($line =~ /Optional arguments/); - - # Match line against the specific stdout format given in ParsedArgs - next unless ($line =~ /$esc\[01m(.+?)$esc\[0m(.+)/); - $key = $1; - - # If an argument is optional, but no value is specified in the usage - # message, this means that the default value is in fact an empty string - $value = $optional ? "" : undef; - - # Look for a default value in this line - if ($2 =~ /default\: (.+)$/) { - $value = $1; - } - - $args{$key} = $value; - } - close FH; - $ENV{'ARACHNE_PRETTY_HELP'} = $temp; - - - - return \%args; -} - - - - -# Print the fancy header box with info about the command, -# including arguments supplied to it -# This parallels the function PrintTheCommandPretty in system/ParsedArgs.cc -sub PrintCommandHeader { - - my ($fh, $prefix, $thickbar) = (*STDOUT, ''); # default values - $fh = $_[0] if ($_[0]); # filehandle to print the header to - $prefix = $_[1] if ($_[1]); # string to be prepended to every line of the header - $thickbar = $_[2] if ($_[2]); # print a thicker version of the bar ('=') - my $width = 80; - - my @stat = stat $0; - my $mtime = localtime($stat[9]); - - my $bar = $thickbar ? 
'='x$width : '-'x$width; - my $timestamp = localtime() . " run (pid=" . $$ . "), last modified $mtime"; - - my $command = $0; - $command .= ' ' while length $command < $width - 1; - $command .= "\\"; - - # Fill @args_parsed with lines of (parsed) info about the args - my @args = @ARGV; - my @args_parsed = (); - my $line = ' '; - while (@args) { - my $arg = shift @args; - - # Start a new line, if necessary - if (length($line) + 1 + length($arg) >= $width - 1 && - $line ne '') { - $line .= ' ' while length $line < $width - 1; - $line .= "\\"; - push @args_parsed, $line; - $line = ' '; - } - - $line .= "$arg "; - } - if ($line) { - push @args_parsed, $line; - } - - - # We have prepared the output lines; - # now, prepend each line with the prefix, and append a newline - map {$_ = "$prefix$_\n"} ($bar, $timestamp, $command, @args_parsed); - - # Print lines to filehandle - print $fh $bar; - print $fh $timestamp; - print $fh $command; - print $fh @args_parsed; - print $fh $bar; - print $fh "\n"; -} - - -1; diff --git a/perl/batchGATKjobsWithRegExp.pl b/perl/batchGATKjobsWithRegExp.pl deleted file mode 100755 index d17597d99..000000000 --- a/perl/batchGATKjobsWithRegExp.pl +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/perl -w - -# Runs a given (GATK) command as specified, except that a single '*' -# in the -I argument field is replaced with the appropriate shell -# expansion and all other instances of the '*' in other arguments are -# replaced with the correct expansion accordingly. -# IMPORTANT: remember to surround your command with quotes (\"\") so -# that the shell doesn't try to expand out your regular expression! - -use strict; -use Getopt::Long; - -my $dry; -my $inArg = "I"; -GetOptions( "dry!" 
=> \$dry, - "inArg=s" => \$inArg); - -if (scalar(@ARGV) != 1) { - print "Usage: batchGATKjobsWithRegExp\n\t[-inArg ]\n\t[-dry]\n\t\"GATK command\"\n"; - exit(1); -} - -my @args = split(/ /, $ARGV[0]); -chomp(@args); - -my $argcount = scalar(@args); -my $IargIdx = undef; -for (my $i = 0; $i < $argcount; $i++) { - if ($args[$i] eq "-$inArg") { - $IargIdx = $i + 1; - } -} -my $Iarg = "$args[$IargIdx]\n"; -if ($Iarg =~ m/(.*),(.*),(.*)/) { - $Iarg = $3; -} -my @matches = glob($Iarg); -$Iarg =~ s/\*/(.*)/; -chomp($Iarg); - -foreach my $match (@matches) { - $match =~ m/$Iarg/; - my $piece = $1; - - my $cmd = ""; - for (my $i = 0; $i < $argcount; $i++) { - my $arg = $args[$i]; - $arg =~ s/\*/$piece/; - $cmd .= "$arg "; - } - - if ($dry) { - print "$cmd\n"; - } else { - system($cmd); - } -} diff --git a/perl/createTranscriptToGenomicInfoTables.pl b/perl/createTranscriptToGenomicInfoTables.pl deleted file mode 100755 index 8787f6ba5..000000000 --- a/perl/createTranscriptToGenomicInfoTables.pl +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/perl -w - -# Runs the TranscriptToGenomicInfo tool to create big tables - -use strict; -use Getopt::Long; - -my $in = undef; -my $gatk = undef; -my $ref = undef; -my $out = undef; -my $tmp = "/tmp"; -GetOptions( "transcript=s" => \$in, - "gatk=s" => \$gatk, - "ref=s" => \$ref, - "out=s" => \$out, - "tmp=s" => \$tmp); - -if ( !$in || !$gatk || !$ref || !$out ) { - print "Usage: createTranscriptToGenomicInfoTables.pl\n\t-transcript \t\n\t-gatk \t\t\n\t-ref \t\t\n\t-out \t\t\n\t-tmp \t\t\n"; - print "Example: ./createTranscriptToGenomicInfoTables.pl\n\t-transcript test.foo\n\t-ref /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-out test.bar\n\t-tmp /broad/shptmp\n"; - exit(1); -} - -# generate a random number -my $random_number = rand(); -my $tmp_prefix = "$tmp/$random_number"; -print "Writing temporary files to prefix: $tmp_prefix\n"; -my $unsorted_table = 
"$tmp_prefix.unsorted.table"; - -# convert the file -print "Converting the transcript table..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T TranscriptToGenomicInfo -R $ref -B:transcripts,AnnotatorInputTable $in -o $unsorted_table -n name,proteinID"; -system($cmd); - -# we need to sort the converted table now -print "\nRe-sorting the table...\n"; -open(SORTED, ">$out") or die "can't open $out: $!"; - -# write the header -open(UNSORTED, "< $unsorted_table") or die "can't open $unsorted_table: $!"; -my $line = ; -print SORTED "$line"; -close(UNSORTED); -close(SORTED); - -$cmd = "grep haplotypeReference -v $unsorted_table | sort -n -k2 -T $tmp | $gatk/perl/sortByRef.pl --tmp $tmp - $ref.fai >> $out"; -system($cmd); - -# clean up -unlink $unsorted_table; - -print "\nDone!\n"; diff --git a/perl/enqueueGATKcallerJobs.pl b/perl/enqueueGATKcallerJobs.pl deleted file mode 100755 index fc65c6641..000000000 --- a/perl/enqueueGATKcallerJobs.pl +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; - -my $pilot = "pilot2"; -my $queue = "gsa"; -my $tech = "SLX"; -my $jar = "/humgen/gsa-scr1/ebanks/Sting/dist/GenomeAnalysisTK.jar"; - -GetOptions( "p:s" => \$pilot, - "q:s" => \$queue, - "tech:s" => \$tech, - "j:s" => \$jar ); - -my @samples; -if ($pilot eq "pilot1") { - @samples = ("CEU","YRI","CHB-JPT"); -} elsif ($pilot eq "pilot2") { - @samples = ("NA19238","NA19239","NA19240","NA12878","NA12891","NA12892"); -} - -foreach my $sample (@samples) { - enqueue($sample, $pilot, $queue, $jar, $tech); -} - -sub enqueue { - - my $sample = $_[0]; - my $pilot = $_[1]; - my $queue = $_[2]; - my $jar = $_[3]; - my $tech = $_[4]; - - my $inputBamStr = ""; - my $outputDir; - if ($pilot eq "pilot2") { - $inputBamStr = "-I /humgen/gsa-hphome1/projects/1kg_pilot2/useTheseBamsForAnalyses/$sample.$tech.bam"; - $outputDir = "/broad/hptmp/ebanks/1kg_pilot2/cleaned/calls"; - } else { - my $num = 1; - while ($num < 23) { - $inputBamStr .= "-I 
/broad/hptmp/ebanks/1kg_pilot1/cleaned/bams/$sample.chr$num.$tech.bam "; - $num++; - } - $inputBamStr .= "-I /broad/hptmp/ebanks/1kg_pilot1/cleaned/bams/$sample.chrX.$tech.bam -I /broad/hptmp/ebanks/1kg_pilot1/cleaned/bams/$sample.chrY.$tech.bam "; - $outputDir = "/broad/hptmp/ebanks/1kg_pilot1/cleaned/calls"; - } - - my $outputFile = "$outputDir/indels/$sample.$tech.low.calls"; - my $cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T IndelGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -o $outputFile -minConsensusFraction 0.5 -minFraction 0."; - if ($pilot eq "pilot1") { $cmd .= "0"; } - $cmd .= "1 -minCnt 2 -1kg"; - system($cmd); - - $outputFile = "$outputDir/indels/$sample.$tech.high.calls"; - $cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T IndelGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -o $outputFile -minConsensusFraction 0.5 -minFraction 0."; - if ($pilot eq "pilot1") { $cmd .= "0"; } - $cmd .= "3 -minCnt 2 -1kg"; - system($cmd); - - if ($pilot eq "pilot2") { - $outputFile = "$outputDir/unfiltered_snps/$sample.$tech.geli.calls"; - $cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -varout $outputFile -lod 5"; - system($cmd); - } -} diff --git a/perl/filterSingleSampleCalls.pl b/perl/filterSingleSampleCalls.pl deleted file mode 100755 index ea92c19b1..000000000 --- a/perl/filterSingleSampleCalls.pl +++ /dev/null @@ -1,203 +0,0 @@ -#!/usr/bin/perl - -use Getopt::Long; -use strict; - - -sub usage { - - my $message = shift; - - print "\nERROR:\n$message\n\n"; - - print "Usage:\n\n"; - print " filterSingleSampleCalls --calls FILEIN [--max_cons_av_mm N1] [--max_ref_av_mm N2] [--max_cons_nqs_av_mm N3] [--min_cons_nqs_av_qual N4] [--min_ref_nqs_av_qual N5] [--mode MODE]\n\n"; - print " FILEIN File to read and apply filter to. 
If \"-\" then read from stdin.\n"; - print " N1 max. average number of mismatches per (consensus) indel-containing read.\n"; - print " If the number is greater then N1, indel will be discarded/marked.\n"; - print " N2 max. average number of mismatches per reference-matching read.\n"; - print " If the number is greater then N2, indel will be discarded/marked.\n"; - print " N3 max. average mismatch rate in NQS window around the indel, across all indel-containing read.\n"; - print " If the number is greater then N3, indel will be discarded/marked.\n"; - print " N4 min. average base quality in all indel supporting reads in the nqs window around the indel.\n"; - print " If the average base quality is less than N4, the indel will be discarded/marked.\n"; - print " N5 min. average base quality in all reference supporting reads in the nqs window around the indel.\n"; - print " If the average base quality is less than N5, the indel will be discarded/marked.\n"; - print " MODE If (any prefix of) ANNOTATE, then indel calls not passing any of the filters will be still printed into stdout with\n"; - print " additional AUTOFILTER_* tags added, specifying what cutoff(s) were not passed.\n"; - print " If (any prefix of) DISCARD, then only indels passing all the filters will be printed into stdout, the rest\n"; - print " will be discarded (default).\n\n"; - - exit(1); - -} - -my $calls = ""; -my $tolerate_old_calls = 1; -my $cons_av_mm_cutoff = 100000; -my $ref_av_mm_cutoff = 100000; -my $cons_av_nqs_mm_cutoff = 100000; -my $ref_nqs_av_qual_cutoff = 0; -my $cons_nqs_av_qual_cutoff = 0; -my $mode_arg = ""; -my $mode = 1; # "discard" - -GetOptions("calls:s" => \$calls, - "max_cons_av_mm:f" => \$cons_av_mm_cutoff, - "max_ref_av_mm:f" => \$ref_av_mm_cutoff, - "max_cons_nqs_av_mm:f" => \$cons_av_nqs_mm_cutoff, - "min_ref_nqs_av_qual:f" => \$ref_nqs_av_qual_cutoff, - "min_cons_nqs_av_qual:f" => \$cons_nqs_av_qual_cutoff, - "mode=s" => \$mode_arg - ) - or usage("Can not parse argument 
string"); - -usage ("--calls argument must be specified") if ( $calls eq "" ) ; -usage ("--mode argument must be specified (unique prefix of: ANNOTATE or DISCARD)") if ( $mode_arg eq ""); - -if ( "ANNOTATE" =~ /^$mode_arg/ ) { - $mode = 0; -} elsif ( "DISCARD" =~ /^$mode_arg/ ) { - $mode=1; -} else { - die("Unrecognized value specified in --mode argument"); -} - -my $input_stream; - -if ( $calls eq "-" ) { - $input_stream = "STDIN"; -} else { - open ( $input_stream, "< $calls") or - die("Can not open input file $calls: $!"); -} - -my $id_counter = 0; - -while( <$input_stream> ) { - - chomp; - - $id_counter++; - my $annotation=""; - - next if ( $_ =~ /_TOO_DIRTY/ ); - -# print $_,"\n"; -# next; - - my $cons_cnt = $1; - my $indel_cnt = $2; - my $cov = $3; - - if ( $_ =~ /\sOBS_COUNTS\[C\/A\/[RT]\]:(\d+)\/(\d+)\/(\d+)\s/ ) { - $cons_cnt = $1; - $indel_cnt = $2; - $cov = $3; - } else { - if ( $tolerate_old_calls != 0 ) { - print "$_\n"; - next; - } else { - die("NO OBS_COUNTS in\n$_\n"); - } - } - - - die ("NO AV_MM MATCH in\n$_\n") if ( $_ !~ /\sAV_MM\[C\/R\]:([\+-\d\.]+)\/([\+-\d\.]+)\s/ ) ; - - my $cons_av_mm = $1; - my $ref_av_mm = $2; - - die("NO AV_MAPQ MATCH in\n$_\n") if ( $_ !~ /\sAV_MAPQ\[C\/R\]:([\+-\d\.]+)\/([\+-\d\.]+)\s/ ) ; - - my $cons_av_mapq = $1; - my $ref_av_mapq = $2; - - die("NO NQS_MM_RATE MATCH in\n$_\n") if ( $_ !~ /\sNQS_MM_RATE\[C\/R\]:([\d\.]+)\/([\d\.]+)\s/ ) ; - - my $cons_nqs_mm_rate = $1; - my $ref_nqs_mm_rate = $2; - - die( "NO NQS_AV_QUAL MATCH in\n$_\n") if ( $_ !~ /\sNQS_AV_QUAL\[C\/R\]:([\d\.]+)\/([\d\.]+)\s/ ) ; - - my $cons_nqs_av_qual = $1; - my $ref_nqs_av_qual = $2; - - - if ( $cons_av_mm < 0 ) { - print STDERR "WARNING: negative mismatch rate in consensus supporting reads:\n"; - print STDERR "$_\n"; - next; - } - if ( $ref_av_mm < 0 ) { - print STDERR "WARNING: negative mismatch rate in reference supporting reads:\n"; - print STDERR "$_\n"; - next; - } - if ( $cons_av_mapq < 0 ) { - print STDERR "WARNING: negative average 
mapping quality in consensus supporting reads:\n"; - print STDERR "$_\n"; - next; - } - if ( $ref_av_mapq < 0 ) { - print STDERR "WARNING: negative average mapping quality in reference supporting reads:\n"; - print STDERR "$_\n"; - next; - } - - if ( $cons_av_mm > $cons_av_mm_cutoff ) { - # filter indel out: alignments for indel-containing reads are too messy - if ( $mode == 0 ) { # annotate - $annotation .= "\tAUTOFILTER_CONS_AV_MM_$cons_av_mm_cutoff"; - } else { - next; # discard - } - } - - if ( $ref_av_mm > $ref_av_mm_cutoff ) { - # filter indel out: alignments for reference-matching reads are too messy - if ( $mode == 0 ) { # annotate - $annotation .= "\tAUTOFILTER_REF_AV_MM_$ref_av_mm_cutoff"; - } else { - next; # discard - } - } - - - if ( $cons_nqs_av_qual < $cons_nqs_av_qual_cutoff ) { - # filter indel out: alignments for indel-containing reads are too messy - if ( $mode == 0 ) { # annotate - $annotation .= "\tAUTOFILTER_CONS_NQS_AV_QUAL_$cons_nqs_av_qual_cutoff"; - } else { - next; # discard - } - } - - if ( $ref_nqs_av_qual < $ref_nqs_av_qual_cutoff ) { - # filter indel out: alignments for reference-matching reads are too messy - if ( $mode == 0 ) { # annotate - $annotation .= "\tAUTOFILTER_REF_NQS_AV_QUAL_$ref_nqs_av_qual_cutoff"; - } else { - next; # discard - } - } - - - if ( $cons_nqs_mm_rate > $cons_av_nqs_mm_cutoff ) { - # filter indel out: consensus nqs window too messy (probably "strange" indel) - if ( $mode == 0 ) { # annotate - $annotation .= "\tAUTOFILTER_CONS_NQS_MM_$cons_av_nqs_mm_cutoff"; - } else { - next; # discard - } - } - - print "$_$annotation\n"; - - -} - - -close $input_stream if ( $calls ne "-" ); - diff --git a/perl/liftOverVCF.pl b/perl/liftOverVCF.pl deleted file mode 100755 index 7039e111f..000000000 --- a/perl/liftOverVCF.pl +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/perl -w - -# Runs the liftover tool on a VCF and properly handles the output - -use strict; -use Getopt::Long; - -my $in = undef; -my $gatk = undef; -my $chain = 
undef; -my $newRef = undef; -my $oldRef = undef; -my $out = undef; -my $tmp = "/tmp"; -my $recordOriginalLocation = 0; -GetOptions( "vcf=s" => \$in, - "gatk=s" => \$gatk, - "chain=s" => \$chain, - "newRef=s" => \$newRef, - "oldRef=s" => \$oldRef, - "out=s" => \$out, - "tmp=s" => \$tmp, - "recordOriginalLocation" => \$recordOriginalLocation); - -if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) { - print "Usage: liftOverVCF.pl\n\t-vcf \t\t\n\t-gatk \t\t\n\t-chain \t\t\n\t-newRef \t\n\t-oldRef \t\n\t-out \t\t\n\t-tmp \t\t\n\t-recordOriginalLocation \t\t\n"; - print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n"; - exit(1); -} - -# generate a random number -my $random_number = rand(); -my $tmp_prefix = "$tmp/$random_number"; -print "Writing temporary files to prefix: $tmp_prefix\n"; -my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; - -# lift over the file -print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; -if ($recordOriginalLocation) { - $cmd .= " -recordOriginalLocation"; -} -system($cmd) == 0 or quit("The liftover step failed. 
Please correct the necessary errors before retrying."); - -# we need to sort the lifted over file now -print "\nRe-sorting the vcf...\n"; -my $sorted_vcf = "$tmp_prefix.sorted.vcf"; -open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!"; - -# write the header -open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!"; -my $inHeader = 1; -while ( $inHeader == 1 ) { - my $line = ; - if ( $line !~ m/^#/ ) { - $inHeader = 0; - } else { - print SORTED "$line"; - } -} -close(UNSORTED); -close(SORTED); - -$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai >> $sorted_vcf"; -system($cmd) == 0 or quit("The sorting step failed. Please correct the necessary errors before retrying."); - -# Filter the VCF for bad records -print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out"; -system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying."); - -# clean up -unlink $unsorted_vcf; -unlink $sorted_vcf; -my $sorted_index = "$sorted_vcf.idx"; -unlink $sorted_index; - -print "\nDone!\n"; - -sub quit { - print "\n$_[0]\n"; - exit(1); -} diff --git a/perl/maf_annotation/annotate_single_maf.pl b/perl/maf_annotation/annotate_single_maf.pl deleted file mode 100755 index e9364e3e3..000000000 --- a/perl/maf_annotation/annotate_single_maf.pl +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env perl -use warnings; -use strict; - -if (scalar(@ARGV) != 1) { - die("Usage: annotate_single_maf.pl \n"); -} - -my ($maf) = @ARGV; - -# TODO: Have a common checkout of https://svn/CancerGenomeAnalysis -# or a compiled version of this matlab program like Firehose uses. 
-my $cmd = "matlab <\n"); -} - -my ($normalAlias) = @ARGV; -my $line; -while (defined($line = )) { - if ($line =~ /#/) { next; } - - my ($chrom, $pos, $rsName, $ref, $alt) = split("\t", $line); - - my $ncbiBuild = "36"; - print join("\t", ($ncbiBuild, $chrom, $pos, $pos, $ref, $alt, $alt, $normalAlias, $normalAlias)) . "\n"; -} diff --git a/perl/randomSampleFromStream.pl b/perl/randomSampleFromStream.pl deleted file mode 100755 index d39d6c961..000000000 --- a/perl/randomSampleFromStream.pl +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; - -sub usage { - print "\nUsage: randomSampleFromStream.pl [-N size] [file1 file2 ...]\n\n"; - print " Selects a random sample of 'size' elements, without replacement,\n"; - print " from dataset that represents a union of all input steams. If no\n"; - print " input streams are specified, then reads STDIN. This script implements\n"; - print " the standard reservoir sampling algorithm, i.e. it does not preload\n"; - print " the data into memory and performs selection in one pass.\n\n"; - print " -N size : optional (default=1), size of the random sample to select.\n\n"; - - exit(0); -} - - - -my @selectedLines; # the line we are going to print at the end -my $sampleSize = 1; - -my @streams; -my $curr_stream; - -sub nextLine { - my $line = <$curr_stream>; - - return $line if ( $line ) ; - - if ( $curr_stream ne "STDIN" && scalar @streams > 0 ) { - # we are done with the current stream: try opening next one - until ( $line ) { - close $curr_stream; - if ( scalar @streams > 0 ) { - my $fname = shift @streams; - open($curr_stream, "< $fname") or - die("Can not open input file $fname"); - $line = <$curr_stream>; - } else { - last; # no more streams left - } - } - } - return $line; -} - -my $help = 0; -GetOptions( "N:s" => \$sampleSize, - "h" => \$help ) or usage(); - -usage() if ( $help ) ; - -if ( scalar(@ARGV) == 0 ) { - $curr_stream = "STDIN"; -} else { - my $fname = shift @ARGV; - open($curr_stream, 
"< $fname") or - die("Can not open input file $fname"); - push @streams, @ARGV; -} - - -my $line; - -for ( my $i = 0 ; $i < $sampleSize; $i++ ) { - $line = nextLine(); - if ( $line ) { - push @selectedLines, $line; - } else { - # no more lines in the input stream(s)! we got less than sampleSize so far! - $sampleSize = $i ; # reset sampleSize to the actual number of lines available - last; - } -} - - -$line = nextLine() if ( $line ) ; # if no more lines left, do not attempt to read -my $index = 0; # where to insert line if selected - -my $counter = $sampleSize; # total number of lines read - -while ( $line ) { - $counter++; - - my $prob = $sampleSize/$counter; - - if ( rand() <= $prob ) { - # line gets selected - $index = int ( rand ( $sampleSize ) ) if ( $sampleSize > 1 ); # choose where to insert - $selectedLines[$index] = $line; # replace old value with newly selected line - } - $line = nextLine(); -} - - -for ( my $i = 0 ; $i < $sampleSize ; $i++ ) { print $selectedLines[$i]; } - diff --git a/perl/runPipelineSanityCheck.pl b/perl/runPipelineSanityCheck.pl deleted file mode 100755 index f50758ab8..000000000 --- a/perl/runPipelineSanityCheck.pl +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; -use Data::Dumper; -use File::Path; - -my @log; - -sub getTimestamp { - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time); - my $timestamp = sprintf("%04d.%02d.%02d", 1900 + $year, $mon, $mday); - - return $timestamp; -} - -sub log { - my ($message) = @_; - - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time); - my $timestamp = sprintf("[%04d.%02d.%02d %02d:%02d:%02d]", 1900 + $year, $mon, $mday, $hour, $min, $sec); - - my @message = split(/\n/, $message); - for (my $i = 0; $i <= $#message; $i++) { - $message[$i] = "$timestamp $message[$i]"; - } - - print STDOUT join("\n", @message) . 
"\n"; - - push(@log, @message); -} - -sub emailLog { - my ($subject, $logfile, $email) = @_; - - my $cmd = "mail -s 'Cron $subject' $email < $logfile"; - system($cmd); -} - -sub killPipeline { - chomp(my @jobs = qx(bjobs -P QueuePipelineTest)); - - my @ids; - foreach my $job (@jobs) { - if ($job =~ /^(\d+)\s+/) { - push(@ids, $1); - } - } - - system("bkill " . join(" ", @ids)); -} - -sub execute { - my ($command, $output, $timeout, $run, $logfile, $email) = @_; - - my $status = 0; - my $result = "(output not generated in dry-run mode)"; - - &log("Execute: '$command'"); - - if ($run == 1) { - eval { - local $SIG{'ALRM'} = sub { die("alarm\n") }; - alarm($timeout); - - $result = qx($command 2>&1); - $status = $?; - - alarm(0); - }; - - if ($@) { - &log("Timeout: '$command' did not return within '$timeout' seconds"); - - if ($run == 1) { - open(LOG, ">$logfile"); - print LOG join("\n", @log); - close(LOG); - - &emailLog("Analysis pipeline nightly test: failed", $logfile, $email); - &killPipeline(); - } - - exit($status); - } - } - - if ($status == 0) { - if ($output == 0) { - &log("Success: '$command' exited with code '$status'"); - } else { - &log("Success: '$command' exited with code '$status' Output:"); - &log($result); - } - - return 0; - } - - &log("Failure: '$command' exited with code '$status' Output:"); - - if (!defined($result)) { - $result = "(unable to capture)\n"; - } - - &log($result); - - if ($run == 1) { - open(LOG, ">$logfile"); - print LOG join("\n", @log); - close(LOG); - - &emailLog("Analysis pipeline nightly test: failed", $logfile, $email); - &killPipeline(); - } - - exit($status); -} - -# Default arguments -my %args = ( - 'pipelineRoot' => "/humgen/gsa-hpprojects/analysis/pipeline", - 'outputRoot' => "/humgen/gsa-hpprojects/analysis/pipeline/projects/QueuePipelineNightlyTest", - 'testRepositoryDir' => "/humgen/gsa-hpprojects/analysis/pipeline/repositories/StingTest", - 'testInputYAML' => 
"/humgen/gsa-hpprojects/GATK/data/Validation_Data/QueuePipelineTestData/QueuePipelineTestData.yaml", - 'email' => "$ENV{'USER'}\@broadinstitute.org", - 'run' => 0, -); - -# Get command-line options -GetOptions( - 'outputRoot=s' => \$args{'outputRoot'}, - 'pipelineRoot=s' => \$args{'pipelineRoot'}, - 'testRepositoryDir=s' => \$args{'testRepositoryDir'}, - 'testInputYAML=s' => \$args{'testInputYAML'}, - 'email=s' => \$args{'email'}, - 'run' => \$args{'run'}, - 'help|h' => \$args{'help'} -); - -# Should we emit some help? -if ($args{'help'}) { - print Dumper(\%args) . "\n"; - exit(0); -} - -# Set some directories -$args{'testDir'} = "$args{'outputRoot'}/" . &getTimestamp(); -$args{'testOutputDir'} = "$args{'testDir'}/output"; -$args{'logFile'} = "$args{'testDir'}/output.log"; - -# Set up environment -$ENV{'PATH'} = "$args{'testRepositoryDir'}/shell:$args{'testRepositoryDir'}/python:$ENV{'PATH'}"; - -# Execute jobs -&execute("echo \$PATH", 1, 10, $args{'run'}, $args{'logFile'}, $args{'email'}); -&execute("mkdir -p $args{'testDir'}", 0, 10, $args{'run'}, $args{'logFile'}, $args{'email'}); -&execute("mkdir -p $args{'testOutputDir'}", 0, 10, $args{'run'}, $args{'logFile'}, $args{'email'}); -&execute("rm -rf $args{'logFile'}", 0, 10, $args{'run'}, $args{'logFile'}, $args{'email'}); - -&execute("cd $args{'testRepositoryDir'} && svn cleanup", 0, 120, $args{'run'}, $args{'logFile'}, $args{'email'}); -&execute("cd $args{'testRepositoryDir'} && svn up", 0, 600, $args{'run'}, $args{'logFile'}, $args{'email'}); -&execute("cd $args{'testRepositoryDir'} && ant clean", 0, 600, $args{'run'}, $args{'logFile'}, $args{'email'}); -&execute("cd $args{'testRepositoryDir'} && ant dist playground oneoffs queue", 0, 1200, $args{'run'}, $args{'logFile'}, $args{'email'}); -&execute("cd $args{'testRepositoryDir'} && svn info", 1, 10, $args{'run'}, $args{'logFile'}, $args{'email'}); - -&execute("cd $args{'testOutputDir'} && java -jar $args{'testRepositoryDir'}/dist/Queue.jar -jobProject 
QueuePipelineTest -S $args{'testRepositoryDir'}/scala/qscript/fullCallingPipeline.q -Y $args{'testInputYAML'} -refseqTable /humgen/gsa-hpprojects/GATK/data/Annotations/refseq/refGene-big-table-hg18.txt --gatkjar $args{'testRepositoryDir'}/dist/GenomeAnalysisTK.jar -titv 3.0 -skipCleaning -bsub", 0, 120, $args{'run'}, $args{'logFile'}, $args{'email'}); -&execute("cd $args{'testOutputDir'} && java -jar $args{'testRepositoryDir'}/dist/Queue.jar -jobProject QueuePipelineTest -S $args{'testRepositoryDir'}/scala/qscript/fullCallingPipeline.q -Y $args{'testInputYAML'} -refseqTable /humgen/gsa-hpprojects/GATK/data/Annotations/refseq/refGene-big-table-hg18.txt --gatkjar $args{'testRepositoryDir'}/dist/GenomeAnalysisTK.jar -titv 3.0 -skipCleaning -bsub -run", 0, 2400, $args{'run'}, $args{'logFile'}, $args{'email'}); - -&log("All tests completed successfully"); - -if ($args{'run'} == 1) { - open(LOG, ">$args{'logFile'}"); - print LOG join("\n", @log); - close(LOG); - - &emailLog("Analysis pipeline nightly test: succeeded", $args{'logFile'}, $args{'email'}); -} diff --git a/perl/runReleaseSanityCheck.pl b/perl/runReleaseSanityCheck.pl deleted file mode 100755 index 2d62b5dc6..000000000 --- a/perl/runReleaseSanityCheck.pl +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/perl - -# Usage:runReleaseSanityCheck [-sting ] [-dry] - -use Getopt::Long; - -$dry; -$sting = "/humgen/gsa-scr1/ebanks/Sting_dev/dist/GenomeAnalysisTK.jar"; -GetOptions( "dry!" 
=> \$dry, - "sting=s" => \$sting); - -$command_prefix = "java -Xmx4096m -jar $sting -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l OFF"; - -$random_number = rand(); -$tmp_bam = "/tmp/$random_number.bam"; - -print "Executing DepthOfCoverage..."; -$command = "$command_prefix -T DepthOfCoverage -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.ESP.WEx.chr1.bam -L chr1:10000000-10100000 -o /dev/null"; -run($command, $dry); - -print "Executing CountCovariatesWholeExome..."; -$command = "$command_prefix -T CountCovariates -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.ESP.WEx.chr1.bam -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod -L /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/whole_exome_agilent_designed_120.targets.chr1.interval_list -standard -OQ -recalFile /dev/null -XL chr1:1,000,000-247179187"; -run($command, $dry); - -print "Executing CountCovariatesWholeGenome..."; -$command = "$command_prefix -T CountCovariates -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.GAII.chr1.50MB.bam -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod -L chr1:1,900,000-2,000,000 -standard -OQ -recalFile /dev/null"; -run($command, $dry); - -print "Executing TableRecalibratorWholeExome..."; -$command = "$command_prefix -T TableRecalibration -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.ESP.WEx.chr1.bam -L /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/whole_exome_agilent_designed_120.targets.chr1.interval_list -OQ -recalFile /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.ESP.WEx.chr1.recal.csv --out $tmp_bam -XL chr1:1,000,000-247179187"; -run($command, $dry); - -print "Executing TableRecalibratorWholeGenome..."; -$command = "$command_prefix -T TableRecalibration -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.GAII.chr1.50MB.bam -L chr1:1,950,000-2,000,000 -OQ -recalFile /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.GAII.chr1.50MB.recal.csv --out $tmp_bam"; 
-run($command, $dry); - -print "Executing IndelRealignerWholeExome..."; -$command = "$command_prefix -T IndelRealigner -LOD 5 -maxConsensuses 100 -greedy 100 -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.ESP.WEx.chr1.bam -L chr1:900,000-1,000,000 -compress 1 -targetIntervals /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.ESP.WEx.chr1.realigner.intervals -o $tmp_bam"; -run($command, $dry); - -print "Executing IndelRealignerWholeGenome..."; -$command = "$command_prefix -T IndelRealigner -LOD 5 -maxConsensuses 100 -greedy 100 -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.GAII.chr1.50MB.bam -L chr1:975,000-1,000,000 -compress 1 -targetIntervals /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.GAII.chr1.50MB.realigner.intervals -o $tmp_bam"; -run($command, $dry); - -print "Executing UnifiedGenotyperWholeExome..."; -$command = "$command_prefix -T UnifiedGenotyper -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.ESP.WEx.chr1.bam -L /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/whole_exome_agilent_designed_120.targets.chr1.interval_list -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod -o /dev/null -XL chr1:1,500,000-247179187"; -run($command, $dry); - -print "Executing UnifiedGenotyperWholeGenome..."; -$command = "$command_prefix -T UnifiedGenotyper -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.GAII.chr1.50MB.bam -L chr1:750,000-1,000,000 -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod -o /dev/null"; -run($command, $dry); - -print "Executing UnifiedGenotyperWholeGenomeMultithreaded..."; -$command = "$command_prefix -T UnifiedGenotyper -I /humgen/gsa-hpprojects/GATK/data/Evaluation_Data/NA12878.GAII.chr1.50MB.bam -L chr1:500,000-1,000,000 -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod -o /dev/null -nt 4"; -run($command, $dry); - -unlink $tmp_bam; - -sub run { - - $command 
= $_[0]; - $dry = $_[1]; - - local $start = time; - - if ($dry) { - print "$command\n"; - } else { - system($command); - } - - $total_time = time - $start; - print " [$total_time sec]\n"; -} diff --git a/perl/sortByRef.pl b/perl/sortByRef.pl deleted file mode 100755 index 71d3f4477..000000000 --- a/perl/sortByRef.pl +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; - -sub usage { - - print "\nUsage:\n"; - print "sortByRef.pl [--k POS] [--tmp dir] INPUT REF_DICT\n\n"; - - print " Sorts lines of the input file INFILE according\n"; - print " to the reference contig order specified by the\n"; - print " reference dictionary REF_DICT (.fai file).\n"; - print " The sort is stable. If -k option is not specified,\n"; - print " it is assumed that the contig name is the first\n"; - print " field in each line.\n\n"; - print " INPUT input file to sort. If '-' is specified, \n"; - print " then reads from STDIN.\n"; - print " REF_DICT .fai file, or ANY file that has contigs, in the\n"; - print " desired soting order, as its first column.\n"; - print " --k POS : contig name is in the field POS (1-based)\n"; - print " of input lines.\n\n"; - print " --tmp DIR : temp directory [default=/tmp]\n\n"; - - exit(1); -} - -my $pos = 1; -my $tmp = "/tmp"; -GetOptions( "k:i" => \$pos, - "tmp=s" => \$tmp); - -$pos--; - -usage() if ( scalar(@ARGV) == 0 ); - -if ( scalar(@ARGV) != 2 ) { - print "Wrong number of arguments\n"; - usage(); -} - -my $input_file = $ARGV[0]; -my $dict_file = $ARGV[1]; - - -open(DICT, "< $dict_file") or die("Can not open $dict_file: $!"); - -my %ref_order; - -my $n = 0; -while ( ) { - chomp; - my ($contig, $rest) = split "\t"; - die("Dictionary file is probably corrupt: multiple instances of contig $contig") if ( defined $ref_order{$contig} ); - - $ref_order{$contig} = $n; - $n++; -} - -close DICT; -#we have loaded contig ordering now - -my $INPUT; -if ($input_file eq "-" ) { - $INPUT = "STDIN"; -} else { - open($INPUT, "< 
$input_file") or die("Can not open $input_file: $!"); -} - -my %temp_outputs; - -while ( <$INPUT> ) { - - my @fields = split '\s'; - die("Specified field position exceeds the number of fields:\n$_") - if ( $pos >= scalar(@fields) ); - - my $contig = $fields[$pos]; - if ( $contig =~ m/:/ ) { - my @loc = split(/:/, $contig); - # print $contig . " " . $loc[0] . "\n"; - $contig = $loc[0] - } - chomp $contig if ( $pos == scalar(@fields) - 1 ); # if last field in line - - my $order; - if ( defined $ref_order{$contig} ) { $order = $ref_order{$contig}; } - else { - $ref_order{$contig} = $n; - $order = $n; # input line has contig that was not in the dict; - $n++; # this contig will go at the end of the output, - # after all known contigs - } - - my $fhandle; - if ( defined $temp_outputs{$order} ) { $fhandle = $temp_outputs{$order} } - else { - #print "opening $order $$ $_\n"; - open( $fhandle, " > $tmp/sortByRef.$$.$order.tmp" ) or - die ( "Can not open temporary file $order: $!"); - $temp_outputs{$order} = $fhandle; - } - - # we got the handle to the temp file that keeps all - # lines with contig $contig - - print $fhandle $_; # send current line to its corresponding temp file -} - -close $INPUT; - -foreach my $f ( values %temp_outputs ) { close $f; } - -# now collect back into single output stream: - -for ( my $i = 0 ; $i < $n ; $i++ ) { - # if we did not have any lines on contig $i, then there's - # no temp file and nothing to do - next if ( ! defined $temp_outputs{$i} ) ; - - my $f; - open ( $f, "< $tmp/sortByRef.$$.$i.tmp" ); - while ( <$f> ) { print ; } - close $f; - - unlink "$tmp/sortByRef.$$.$i.tmp"; -} diff --git a/perl/splitAndEnqueueGATKjobs.pl b/perl/splitAndEnqueueGATKjobs.pl deleted file mode 100755 index 20641c0a7..000000000 --- a/perl/splitAndEnqueueGATKjobs.pl +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/perl -w - -# Runs a given GATK walker genome-wide, but splits up the jobs -# and then merges the results together. 
One should really use -# scatter-gather here, but I wanted to test against that framework -# (to ensure consistency) so I wrote this temporary script. -# Also, it allowed me to add in farm commands for wait ids -# (which is currently unavailable in scatter-gather). -# If intervals file is left blank, it splits by chromosome. - -use strict; -use Getopt::Long; - -sub usage { - print "Usage: perl enqueueGATKjobsByChromosome.pl\n\t-cmd \n\t-o \n\t[-oarg GATK output argument; default:o]\n\t[-i intervals file]\n\t[-n number of splits to make]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job farm job name]\n\t[-dry]\n\t[-bam is output format bam?; default:no]\n"; - exit(1); -} - -my $cmd = undef; -my $output = undef; -my $outputArg = "o"; -my $jobName = undef; -my $wait = undef; -my $queue = "gsa"; -my $breaks = 24; -my $intervalsFile = undef; -my $dry; -my $isBam; - -GetOptions( "cmd=s" => \$cmd, - "o=s" => \$output, - "oarg:s" => \$outputArg, - "job:s" => \$jobName, - "wait:s" => \$wait, - "n:i" => \$breaks, - "i:s" => \$intervalsFile, - "dry!" => \$dry, - "bam!" 
=> \$isBam, - "q:s" => \$queue); - -usage() if ( !$cmd || !$output ); - -my @intervals; -if (!$intervalsFile) { - my $num = 1; - while ($num < 23) { - push(@intervals, $num); - $num++; - } - push(@intervals, "X"); - push(@intervals, "Y"); -} else { - open(FILE, "< $intervalsFile") or die "can't open $intervalsFile: $!"; - my @lines = ; - chomp(@lines); - my $linecount = scalar(@lines); - if ($linecount < $breaks) { - $breaks = $linecount; - } - - my $linesPerJob = $linecount / $breaks; - my $index = 0; - for (my $i = 1; $i < $breaks; $i++) { - my $interval = ""; - for (my $j = 0; $j < $linesPerJob; $j++) { - $interval .= "$lines[$index];"; - $index++; - } - push(@intervals, $interval); - } - my $interval = ""; - while ($index < $linecount) { - $interval .= "$lines[$index];"; - $index++; - } - push(@intervals, $interval); -} - -my $intervalcount = scalar(@intervals); -for (my $i = 0; $i < $intervalcount; $i++) { - enqueue($intervals[$i], $cmd, $output, $outputArg, $wait, $queue, $dry, $isBam, ($i+1)); -} - -mergeResults($output, $queue, $dry, $isBam, $jobName, $intervalcount); - -sub enqueue { - - my $interval = $_[0]; - my $cmd = $_[1]; - my $outArg = $_[3]; - my $waitid = $_[4]; - my $queue = $_[5]; - my $dry = $_[6]; - my $bam = $_[7]; - my $index = $_[8]; - - my $output = "$_[2].$index"; - if ($bam) { - $output .= ".bam"; - } - - my $bsub = "bsub -q $queue -o $output.sdout -J $_[2]"; - if ($waitid) { - $bsub .= " -w \"ended($waitid)\""; - } - - my $command = "$bsub $cmd -$outputArg $output -L $interval"; - execute($command, $dry); -} - -sub mergeResults { - - my $output = $_[0]; - my $queue = $_[1]; - my $dry = $_[2]; - my $bam = $_[3]; - my $jobName = $_[4]; - my $intervalcount = $_[5]; - - my $cmd = "bsub -q $queue -o $output.sdout -w \"ended($output)\""; - if ($jobName) { - $cmd .= " -J $jobName"; - } - - if ($bam) { - $cmd .= " samtools merge $output "; - for (my $i = 1; $i <= $intervalcount; $i++) { - $cmd .= "$output.$i.bam "; - } - } else { - $cmd .= " 
\"cat "; - for (my $i = 1; $i <= $intervalcount; $i++) { - $cmd .= "$output.$i "; - } - $cmd .= "> $output\""; - } - - execute($cmd, $dry); -} - -sub execute { - - my $cmd = $_[0]; - my $dry = $_[1]; - - if ($dry) { - print "$cmd\n"; - } else { - system($cmd); - } -} diff --git a/perl/sync1000Genomes/README.sync b/perl/sync1000Genomes/README.sync deleted file mode 100644 index d8492750a..000000000 --- a/perl/sync1000Genomes/README.sync +++ /dev/null @@ -1,9 +0,0 @@ -Before sync'ing 1000 Genomes, you need to be logged in as gsa-dev: -% sudo -s -u gsa-dev -[Note that this step needs to be done before subsequent steps for stability] - -To use Aspera, you'll need to ssh into one of the appropriate machines: -% ssh vbigtube or mirror - -[The NCBI Aspera source is: anonftp@ftp-private.ncbi.nih.gov:/1000genomes/ftp/] - diff --git a/perl/sync1000Genomes/checkMD5s.pl b/perl/sync1000Genomes/checkMD5s.pl deleted file mode 100755 index 199b2d04b..000000000 --- a/perl/sync1000Genomes/checkMD5s.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl -w - -use Getopt::Long; - -sub usage { - print "Usage: perl checkMD5s.pl\n\t-ai \n\t-o \n"; - exit(1); -} - - -my $ai = undef; -my $out = undef; -GetOptions( "ai=s" => \$ai, - "o=s" => \$out); - -usage() if ( !$ai || !$out ); - -open(OUT, "> $out") or die "can't open $out: $!"; - -open(LIST, "< $ai") or die "can't open $ai: $!"; -while ( ) { - @pieces = split(' ', $_); - if ( @pieces == 6 ) { - check($pieces[0], $pieces[1]); - check($pieces[2], $pieces[3]); - check($pieces[4], $pieces[5]); - } -} - -close(LIST); -close(OUT); - -sub check { - - my $file = $_[0]; - my $target = $_[1]; - - print "Checking /humgen/1kg/DCC/ftp/$file\n"; - @md5 = split(' ', `md5sum /humgen/1kg/DCC/ftp/$file`); - if ( $md5[0] ne $target ) { - print OUT "$file\t$md5[0]\t$target\n"; - } -} diff --git a/perl/sync1000Genomes/findFilesNotInAlignmentIndex.sh b/perl/sync1000Genomes/findFilesNotInAlignmentIndex.sh deleted file mode 100755 index 1592b1fff..000000000 --- 
a/perl/sync1000Genomes/findFilesNotInAlignmentIndex.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -find /humgen/1kg/DCC/ftp/data/ -type f | awk -F "/" '{print $6 "/" $7 "/" $8 "/" $9}' | sort > filesWeHave.list -grep -v MD5 /humgen/1kg/DCC/ftp/alignment.index | awk '{print $1 "\n" $3 "\n" $5}' | sort > filesWeWant.list -comm -23 filesWeHave.list filesWeWant.list > filesToDelete.list -comm -13 filesWeHave.list filesWeWant.list > filesToGet.list - diff --git a/perl/sync1000Genomes/runAspera.pl b/perl/sync1000Genomes/runAspera.pl deleted file mode 100755 index cf4c6554d..000000000 --- a/perl/sync1000Genomes/runAspera.pl +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/perl -w - -# Runs Aspera to pull down files - -use strict; -use Getopt::Long; - -my $source = undef; -my $dest = "."; -GetOptions( "source=s" => \$source, - "dest=s" => \$dest); - -if ( !$source) { - print "Usage: runAspera.pl\n\t-source \t\n\t-dest \t\t\n"; - exit(1); -} - -my $cmd = "ascp -i /opt/aspera/etc/asperaweb_id_dsa.putty -k2 -QTr -l2G -d -v $source $dest"; -system($cmd); diff --git a/perl/sync1000Genomes/runWget.pl b/perl/sync1000Genomes/runWget.pl deleted file mode 100755 index c2bdac748..000000000 --- a/perl/sync1000Genomes/runWget.pl +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/perl -w - -# Runs Wget to pull down a file - -use strict; -use Getopt::Long; - -my $file = undef; -GetOptions( "file=s" => \$file); - -if ( !$file) { - print "Usage: runWget.pl\n\t-file \t\n"; - exit(1); -} - -chomp($file); -my $cmd = "wget -O /humgen/1kg/DCC/ftp/$file ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/$file"; -print "$cmd\n"; -system($cmd); diff --git a/perl/sync1000Genomes/syncFilesInList.pl b/perl/sync1000Genomes/syncFilesInList.pl deleted file mode 100755 index 6a7ac83c5..000000000 --- a/perl/sync1000Genomes/syncFilesInList.pl +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl -w - -use Getopt::Long; - -sub usage { - print "Usage: perl syncFilesInList.pl\n\t-files \n\t-protocol [defaults to 'aspera'; can also use 
'wget']\n\t[-dry]\n"; - exit(1); -} - -my $files = undef; -my $dry; -my $protocol = "aspera"; - -GetOptions( "files=s" => \$files, - "dry!" => \$dry, - "protocol=s" => \$protocol); - -usage() if ( !$files ); - -open(LIST, "< $files") or die "can't open $files: $!"; -while ( ) { - chomp($_); - if ( $protocol eq "aspera" ) { - $_ =~ m/data\/(.*)\/alignment.*/; - $cmd = "./runAspera.pl -source anonftp\@ftp-private.ncbi.nih.gov:/1000genomes/ftp/$_ -dest /humgen/1kg/DCC/ftp/data/$1/alignment/"; - execute($cmd, $dry); - } elsif ( $protocol eq "wget" ) { - $cmd = "./runWget.pl -file $_"; - execute($cmd, $dry); - } else { - usage(); - } -} -close(LIST); - -sub execute { - - my $cmd = $_[0]; - my $dry = $_[1]; - - if ($dry) { - print "$cmd\n"; - } else { - system($cmd); - } -} diff --git a/perl/tablesorter.pl b/perl/tablesorter.pl deleted file mode 100644 index 7b8be28d3..000000000 --- a/perl/tablesorter.pl +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/perl -w - -# sorts by reference - -use strict; -use Getopt::Long; - -my $in = undef; -my $gatk = undef; -my $ref = undef; -my $out = undef; -my $tmp = "/tmp"; -my $contig = undef; -my $startpos =undef; -my $mark =undef; - -GetOptions( "in=s" => \$in, - "gatk=s" => \$gatk, - "ref=s" => \$ref, - "out=s" => \$out, - "tmp=s" => \$tmp, - "startpos=s" => \$startpos, - "contig=s" => \$contig, - "headermarker=s" => \$mark); - - if ( !$in || !$gatk || !$ref || !$out ) { - print "Usage: tablesorter.pl\n\t-in \t\t\t\n\t-gatk \t\t\t\n\t-ref \t\t\t\n\t-out \t\t\t\n\t-tmp \t\t\t\n\t-startpos \t\t\n\t-contig \t\t\n\t-headermarker \t\t\n"; - print "Example: ./tablesorter.pl\n\t-in test.foo\n\t-ref /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta\n\t-gatk /humgen/gsa-s r1/ebanks/Sting_dev\n\t-out test.bar\n\t-tmp /broad/shptmp\n\t-startpos /2\n\t-contig 1\n\t-headermarker haplotype \n"; - exit(1); - } - - -# we need to sort the converted table now - print "\nSorting the table...\n"; - open(SORTED, ">$out") or die "can't 
open $out: $!"; - -# write the header - open(UNSORTED, "< $in") or die "can't open $in: $!"; - my $line = ; - print SORTED "$line"; - close(UNSORTED); - close(SORTED); - -print "$line is the header"; - my $cmd = "grep $mark -v $in | sort -n -k$startpos -T $tmp | $gatk/perl/sortByRef.pl --k $contig --tmp $tmp - $ref.fai >> $out"; - system($cmd); -# clean upunlink $unsorted_table; - - print "\nDone!\n"; diff --git a/chainFiles/b36tob37.chain b/public/chainFiles/b36tob37.chain similarity index 100% rename from chainFiles/b36tob37.chain rename to public/chainFiles/b36tob37.chain diff --git a/chainFiles/b37tob36.chain b/public/chainFiles/b37tob36.chain similarity index 100% rename from chainFiles/b37tob36.chain rename to public/chainFiles/b37tob36.chain diff --git a/chainFiles/b37tohg18.chain b/public/chainFiles/b37tohg18.chain similarity index 100% rename from chainFiles/b37tohg18.chain rename to public/chainFiles/b37tohg18.chain diff --git a/chainFiles/b37tohg19.chain b/public/chainFiles/b37tohg19.chain similarity index 100% rename from chainFiles/b37tohg19.chain rename to public/chainFiles/b37tohg19.chain diff --git a/chainFiles/hg18tob37.chain b/public/chainFiles/hg18tob37.chain similarity index 100% rename from chainFiles/hg18tob37.chain rename to public/chainFiles/hg18tob37.chain diff --git a/chainFiles/hg19toHg18.chain b/public/chainFiles/hg19toHg18.chain similarity index 100% rename from chainFiles/hg19toHg18.chain rename to public/chainFiles/hg19toHg18.chain diff --git a/chainFiles/makeChains.py b/public/chainFiles/makeChains.py similarity index 100% rename from chainFiles/makeChains.py rename to public/chainFiles/makeChains.py diff --git a/doc/Ant_Help.tex b/public/doc/Ant_Help.tex similarity index 100% rename from doc/Ant_Help.tex rename to public/doc/Ant_Help.tex diff --git a/doc/GATK_Coding_Standards.pdf b/public/doc/GATK_Coding_Standards.pdf similarity index 100% rename from doc/GATK_Coding_Standards.pdf rename to public/doc/GATK_Coding_Standards.pdf diff 
--git a/doc/GATK_Coding_Standards.tex b/public/doc/GATK_Coding_Standards.tex similarity index 100% rename from doc/GATK_Coding_Standards.tex rename to public/doc/GATK_Coding_Standards.tex diff --git a/doc/README b/public/doc/README similarity index 100% rename from doc/README rename to public/doc/README diff --git a/java/config/log4j.properties b/public/java/config/log4j.properties similarity index 100% rename from java/config/log4j.properties rename to public/java/config/log4j.properties diff --git a/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java similarity index 100% rename from java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java rename to public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java diff --git a/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java similarity index 100% rename from java/src/net/sf/samtools/GATKBAMFileSpan.java rename to public/java/src/net/sf/samtools/GATKBAMFileSpan.java diff --git a/java/src/net/sf/samtools/GATKBin.java b/public/java/src/net/sf/samtools/GATKBin.java similarity index 100% rename from java/src/net/sf/samtools/GATKBin.java rename to public/java/src/net/sf/samtools/GATKBin.java diff --git a/java/src/net/sf/samtools/GATKBinList.java b/public/java/src/net/sf/samtools/GATKBinList.java similarity index 100% rename from java/src/net/sf/samtools/GATKBinList.java rename to public/java/src/net/sf/samtools/GATKBinList.java diff --git a/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java similarity index 100% rename from java/src/net/sf/samtools/GATKChunk.java rename to public/java/src/net/sf/samtools/GATKChunk.java diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java rename to public/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java similarity index 100% rename from java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java rename to public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java diff --git a/java/src/org/broadinstitute/sting/commandline/Argument.java b/public/java/src/org/broadinstitute/sting/commandline/Argument.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/Argument.java rename to public/java/src/org/broadinstitute/sting/commandline/Argument.java diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentCollection.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentCollection.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentCollection.java rename to public/java/src/org/broadinstitute/sting/commandline/ArgumentCollection.java diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java rename to public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java rename to public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java diff --git 
a/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java rename to public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentException.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentException.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentException.java rename to public/java/src/org/broadinstitute/sting/commandline/ArgumentException.java diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentIOType.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentIOType.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentIOType.java rename to public/java/src/org/broadinstitute/sting/commandline/ArgumentIOType.java diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java rename to public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java rename to public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentSource.java rename to 
public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java rename to public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java diff --git a/java/src/org/broadinstitute/sting/commandline/ClassType.java b/public/java/src/org/broadinstitute/sting/commandline/ClassType.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ClassType.java rename to public/java/src/org/broadinstitute/sting/commandline/ClassType.java diff --git a/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java rename to public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java diff --git a/java/src/org/broadinstitute/sting/commandline/CommandLineUtils.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/CommandLineUtils.java rename to public/java/src/org/broadinstitute/sting/commandline/CommandLineUtils.java diff --git a/java/src/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java b/public/java/src/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java rename to public/java/src/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java diff --git a/java/src/org/broadinstitute/sting/commandline/Gather.java b/public/java/src/org/broadinstitute/sting/commandline/Gather.java similarity index 
100% rename from java/src/org/broadinstitute/sting/commandline/Gather.java rename to public/java/src/org/broadinstitute/sting/commandline/Gather.java diff --git a/java/src/org/broadinstitute/sting/commandline/Gatherer.java b/public/java/src/org/broadinstitute/sting/commandline/Gatherer.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/Gatherer.java rename to public/java/src/org/broadinstitute/sting/commandline/Gatherer.java diff --git a/java/src/org/broadinstitute/sting/commandline/Hidden.java b/public/java/src/org/broadinstitute/sting/commandline/Hidden.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/Hidden.java rename to public/java/src/org/broadinstitute/sting/commandline/Hidden.java diff --git a/java/src/org/broadinstitute/sting/commandline/Input.java b/public/java/src/org/broadinstitute/sting/commandline/Input.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/Input.java rename to public/java/src/org/broadinstitute/sting/commandline/Input.java diff --git a/java/src/org/broadinstitute/sting/commandline/MissingArgumentValueException.java b/public/java/src/org/broadinstitute/sting/commandline/MissingArgumentValueException.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/MissingArgumentValueException.java rename to public/java/src/org/broadinstitute/sting/commandline/MissingArgumentValueException.java diff --git a/java/src/org/broadinstitute/sting/commandline/Output.java b/public/java/src/org/broadinstitute/sting/commandline/Output.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/Output.java rename to public/java/src/org/broadinstitute/sting/commandline/Output.java diff --git a/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/commandline/ParsingEngine.java rename to public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java diff --git a/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/ParsingMethod.java rename to public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java diff --git a/java/src/org/broadinstitute/sting/commandline/Tags.java b/public/java/src/org/broadinstitute/sting/commandline/Tags.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/Tags.java rename to public/java/src/org/broadinstitute/sting/commandline/Tags.java diff --git a/java/src/org/broadinstitute/sting/commandline/package-info.java b/public/java/src/org/broadinstitute/sting/commandline/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/commandline/package-info.java rename to public/java/src/org/broadinstitute/sting/commandline/package-info.java diff --git a/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java b/public/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java similarity index 100% rename from java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java rename to public/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java diff --git a/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java b/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java similarity index 100% rename from java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java rename to public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java diff --git a/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java 
b/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java similarity index 100% rename from java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java rename to public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java diff --git a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java rename to public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java diff --git a/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java rename to public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java diff --git a/java/src/org/broadinstitute/sting/gatk/DownsampleType.java b/public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/DownsampleType.java rename to public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java diff --git a/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java rename to public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java rename to public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java diff --git a/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java 
b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/ReadMetrics.java rename to public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java diff --git a/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/ReadProperties.java rename to public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java diff --git a/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/WalkerManager.java rename to public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java rename to public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java rename to public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java rename to public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java rename to public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java rename to public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java 
b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java rename to 
public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java similarity 
index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/View.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/View.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/datasources/providers/View.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/View.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/providers/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java rename to 
public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java 
b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java 
b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java rename to 
public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceProgressListener.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceProgressListener.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceProgressListener.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceProgressListener.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reference/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/reference/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reference/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java similarity index 100% 
rename from java/src/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java rename to 
public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java diff --git a/java/src/org/broadinstitute/sting/gatk/examples/CoverageBySample.java b/public/java/src/org/broadinstitute/sting/gatk/examples/CoverageBySample.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/examples/CoverageBySample.java rename to public/java/src/org/broadinstitute/sting/gatk/examples/CoverageBySample.java diff --git a/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java rename to public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java b/public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java 
b/public/java/src/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/ReduceTree.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ReduceTree.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/ReduceTree.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/ReduceTree.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java b/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java diff --git a/java/src/org/broadinstitute/sting/gatk/executive/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/executive/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/executive/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/executive/package-info.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java b/public/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java similarity 
index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java rename to 
public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/SampleFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/SampleFilter.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/filters/SampleFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/SampleFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/ZeroMappingQualityReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ZeroMappingQualityReadFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/ZeroMappingQualityReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/ZeroMappingQualityReadFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/filters/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/filters/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java b/public/java/src/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java rename to public/java/src/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/io/OutputTracker.java b/public/java/src/org/broadinstitute/sting/gatk/io/OutputTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/OutputTracker.java rename to public/java/src/org/broadinstitute/sting/gatk/io/OutputTracker.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java b/public/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java rename to public/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java rename to public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java rename to public/java/src/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java rename to public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/Storage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/Storage.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/storage/Storage.java 
rename to public/java/src/org/broadinstitute/sting/gatk/io/storage/Storage.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java rename to public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java rename to public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java rename to public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java rename to public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java rename to 
public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java rename to public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java rename to public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java rename to public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java rename to public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java rename to public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/BufferingReadIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/BufferingReadIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/BufferingReadIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/BufferingReadIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/IterableIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/IterableIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/IterableIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/IterableIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java similarity 
index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java rename to 
public/java/src/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/iterators/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/package-info.java 
b/public/java/src/org/broadinstitute/sting/gatk/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java rename to public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordIterator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/RODRecordIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java 
diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java 
similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java rename to 
public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java rename to 
public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/QueryableTrack.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/QueryableTrack.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/tracks/QueryableTrack.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/QueryableTrack.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackCreationException.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackCreationException.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackCreationException.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackCreationException.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeatureIterator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeatureIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeatureIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeatureIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java diff --git a/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java 
b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/report/GATKReport.java rename to public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java diff --git a/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java rename to public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java diff --git a/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java rename to public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java diff --git a/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java rename to public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java diff --git a/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java rename to public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java rename to public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java 
diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java rename to public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java rename to public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java rename to public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java rename to public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/traversals/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/traversals/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/Allows.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Attribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Attribution.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/Attribution.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/Attribution.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/By.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/By.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/By.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/By.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java rename to 
public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/FindReadsWithNamesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/FindReadsWithNamesWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/FindReadsWithNamesWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/FindReadsWithNamesWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/GCContentByIntervalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/GCContentByIntervalWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/GCContentByIntervalWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/GCContentByIntervalWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Multiplex.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplex.java 
similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/Multiplex.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplex.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Multiplexer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplexer.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/Multiplexer.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplexer.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PartitionBy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionBy.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/PartitionBy.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionBy.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/RMD.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/Reference.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Requires.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Requires.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/Requires.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/Requires.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/SplitSamFileWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/SplitSamFileWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/SplitSamFileWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/SplitSamFileWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/Walker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/WalkerName.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/WalkerName.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/WalkerName.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/WalkerName.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Window.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Window.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/Window.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/Window.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java 
similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java rename to 
public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java 
rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java 
rename to public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoarseCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoarseCoverageWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoarseCoverageWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoarseCoverageWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java 
rename to public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStatsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStatsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStatsWalker.java rename to 
public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStatsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java rename 
to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java 
similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java similarity index 100% rename 
from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java 
similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java similarity index 99% rename from java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 89bdd53c8..e7b9cfa68 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ 
-32,9 +32,9 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; /*import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate; -import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager; -import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDatum; -import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalibrationArgumentCollection; +import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDataManager; +import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDatum; +import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalibrationArgumentCollection; */import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.classloader.PluginManager; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/package-info.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/package-info.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/package-info.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CycleQualityWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CycleQualityWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/CycleQualityWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CycleQualityWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStatsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStatsWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStatsWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStatsWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadValidationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadValidationWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadValidationWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadValidationWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java rename to 
public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/CreateSequenomMask.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/CreateSequenomMask.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/sequenom/CreateSequenomMask.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/CreateSequenomMask.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java 
rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PhaseStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PhaseStats.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PhaseStats.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PhaseStats.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SamplePreviousGenotypes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SamplePreviousGenotypes.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SamplePreviousGenotypes.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SamplePreviousGenotypes.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java diff 
--git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java similarity index 100% rename 
from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/tags/Analysis.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/tags/Analysis.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/tags/Analysis.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/tags/Analysis.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/tags/DataPoint.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/tags/DataPoint.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/tags/DataPoint.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/tags/DataPoint.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java rename to 
public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java rename to 
public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java diff --git 
a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java rename to 
public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java similarity index 100% rename from java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java diff --git a/java/src/org/broadinstitute/sting/jna/clibrary/JNAUtils.java b/public/java/src/org/broadinstitute/sting/jna/clibrary/JNAUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/jna/clibrary/JNAUtils.java rename to public/java/src/org/broadinstitute/sting/jna/clibrary/JNAUtils.java diff --git a/java/src/org/broadinstitute/sting/jna/clibrary/LibC.java b/public/java/src/org/broadinstitute/sting/jna/clibrary/LibC.java similarity index 100% rename from java/src/org/broadinstitute/sting/jna/clibrary/LibC.java rename to public/java/src/org/broadinstitute/sting/jna/clibrary/LibC.java diff --git a/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java similarity index 100% rename from java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java rename to public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java diff --git a/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java similarity index 100% rename from java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java rename to public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java diff --git a/java/src/org/broadinstitute/sting/queue/QueueVersion.java b/public/java/src/org/broadinstitute/sting/queue/QueueVersion.java similarity index 100% rename from java/src/org/broadinstitute/sting/queue/QueueVersion.java rename to public/java/src/org/broadinstitute/sting/queue/QueueVersion.java diff --git a/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java rename to public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java diff --git a/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java similarity index 100% rename from java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java rename to public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java diff --git a/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java similarity index 100% rename from java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java rename to public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java diff --git a/java/src/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java similarity index 100% rename from java/src/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java rename to public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java diff --git a/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java similarity index 100% rename from java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java rename to public/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java diff --git a/java/src/org/broadinstitute/sting/queue/package-info.java b/public/java/src/org/broadinstitute/sting/queue/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/queue/package-info.java rename to 
public/java/src/org/broadinstitute/sting/queue/package-info.java diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/BaseUtils.java rename to public/java/src/org/broadinstitute/sting/utils/BaseUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/DisjointSet.java b/public/java/src/org/broadinstitute/sting/utils/DisjointSet.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/DisjointSet.java rename to public/java/src/org/broadinstitute/sting/utils/DisjointSet.java diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/GenomeLoc.java rename to public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/GenomeLocParser.java rename to public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java rename to public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java diff --git a/java/src/org/broadinstitute/sting/utils/HasGenomeLocation.java b/public/java/src/org/broadinstitute/sting/utils/HasGenomeLocation.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/HasGenomeLocation.java rename to public/java/src/org/broadinstitute/sting/utils/HasGenomeLocation.java diff --git a/java/src/org/broadinstitute/sting/utils/HeapSizeMonitor.java 
b/public/java/src/org/broadinstitute/sting/utils/HeapSizeMonitor.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/HeapSizeMonitor.java rename to public/java/src/org/broadinstitute/sting/utils/HeapSizeMonitor.java diff --git a/java/src/org/broadinstitute/sting/utils/IndelUtils.java b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/IndelUtils.java rename to public/java/src/org/broadinstitute/sting/utils/IndelUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/MannWhitneyU.java rename to public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/MathUtils.java rename to public/java/src/org/broadinstitute/sting/utils/MathUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/MendelianViolation.java rename to public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java diff --git a/java/src/org/broadinstitute/sting/utils/PathUtils.java b/public/java/src/org/broadinstitute/sting/utils/PathUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/PathUtils.java rename to public/java/src/org/broadinstitute/sting/utils/PathUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/QualityUtils.java rename to 
public/java/src/org/broadinstitute/sting/utils/QualityUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java rename to public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java diff --git a/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java rename to public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java diff --git a/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/SampleUtils.java rename to public/java/src/org/broadinstitute/sting/utils/SampleUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java b/public/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java rename to public/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/SimpleTimer.java b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/SimpleTimer.java rename to public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/Utils.java rename to public/java/src/org/broadinstitute/sting/utils/Utils.java diff --git 
a/java/src/org/broadinstitute/sting/utils/analysis/AminoAcid.java b/public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcid.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/analysis/AminoAcid.java rename to public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcid.java diff --git a/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java b/public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java rename to public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java diff --git a/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java b/public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java rename to public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/baq/BAQ.java rename to public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java diff --git a/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java rename to public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java diff --git a/java/src/org/broadinstitute/sting/utils/bed/BedParser.java b/public/java/src/org/broadinstitute/sting/utils/bed/BedParser.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/bed/BedParser.java rename to public/java/src/org/broadinstitute/sting/utils/bed/BedParser.java diff --git 
a/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java b/public/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java rename to public/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java b/public/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java rename to public/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java diff --git a/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java b/public/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java rename to public/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java diff --git a/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java b/public/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java rename to public/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java diff --git a/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java b/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java rename to public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java rename 
to public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java diff --git a/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java rename to public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java diff --git a/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java rename to public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java diff --git a/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java rename to public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapCodec.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapFeature.java 
b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapFeature.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapFeature.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/ManualSortingVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/ManualSortingVCFWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/ManualSortingVCFWriter.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/ManualSortingVCFWriter.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriter.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriter.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java diff --git 
a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLine.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLine.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLine.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineTranslator.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineTranslator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineTranslator.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineTranslator.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineType.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineType.java 
similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineType.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineType.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderVersion.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderVersion.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderVersion.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderVersion.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFNamedHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFNamedHeaderLine.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFNamedHeaderLine.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFNamedHeaderLine.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java diff --git a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java diff --git 
a/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java diff --git a/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/collections/CircularArray.java rename to public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java diff --git a/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java b/public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java rename to public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java diff --git a/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java rename to public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java diff --git a/java/src/org/broadinstitute/sting/utils/collections/Pair.java b/public/java/src/org/broadinstitute/sting/utils/collections/Pair.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/collections/Pair.java rename to public/java/src/org/broadinstitute/sting/utils/collections/Pair.java diff --git a/java/src/org/broadinstitute/sting/utils/collections/PrimitivePair.java b/public/java/src/org/broadinstitute/sting/utils/collections/PrimitivePair.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/utils/collections/PrimitivePair.java rename to public/java/src/org/broadinstitute/sting/utils/collections/PrimitivePair.java diff --git a/java/src/org/broadinstitute/sting/utils/collections/RODMergingIterator.java b/public/java/src/org/broadinstitute/sting/utils/collections/RODMergingIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/collections/RODMergingIterator.java rename to public/java/src/org/broadinstitute/sting/utils/collections/RODMergingIterator.java diff --git a/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java b/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java rename to public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java b/public/java/src/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java rename to public/java/src/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java diff --git a/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java rename to public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java diff --git a/java/src/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java rename to 
public/java/src/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java diff --git a/java/src/org/broadinstitute/sting/utils/exceptions/StingException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/StingException.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/exceptions/StingException.java rename to public/java/src/org/broadinstitute/sting/utils/exceptions/StingException.java diff --git a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/exceptions/UserException.java rename to public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java diff --git a/java/src/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java b/public/java/src/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java rename to public/java/src/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java rename to public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java diff --git a/java/src/org/broadinstitute/sting/utils/fasta/package-info.java b/public/java/src/org/broadinstitute/sting/utils/fasta/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/fasta/package-info.java rename to public/java/src/org/broadinstitute/sting/utils/fasta/package-info.java diff --git a/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java 
b/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java rename to public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java diff --git a/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java b/public/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java rename to public/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java diff --git a/java/src/org/broadinstitute/sting/utils/genotype/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/utils/genotype/DiploidGenotype.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/genotype/DiploidGenotype.java rename to public/java/src/org/broadinstitute/sting/utils/genotype/DiploidGenotype.java diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java rename to public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java diff --git a/java/src/org/broadinstitute/sting/utils/help/ApplicationDetails.java b/public/java/src/org/broadinstitute/sting/utils/help/ApplicationDetails.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/help/ApplicationDetails.java rename to public/java/src/org/broadinstitute/sting/utils/help/ApplicationDetails.java diff --git a/java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java rename to 
public/java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java diff --git a/java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java rename to public/java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java diff --git a/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java rename to public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java diff --git a/java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java rename to public/java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java diff --git a/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java rename to public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java diff --git a/java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java rename to public/java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java diff --git a/java/src/org/broadinstitute/sting/utils/instrumentation/Sizeof.java b/public/java/src/org/broadinstitute/sting/utils/instrumentation/Sizeof.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/utils/instrumentation/Sizeof.java rename to public/java/src/org/broadinstitute/sting/utils/instrumentation/Sizeof.java diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java rename to public/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java rename to public/java/src/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java rename to public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java rename to public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIterator.java b/public/java/src/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIterator.java rename to 
public/java/src/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIterator.java diff --git a/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java b/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java rename to public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java diff --git a/java/src/org/broadinstitute/sting/utils/package-info.java b/public/java/src/org/broadinstitute/sting/utils/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/package-info.java rename to public/java/src/org/broadinstitute/sting/utils/package-info.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java 
b/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java rename to public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup2/Notes b/public/java/src/org/broadinstitute/sting/utils/pileup2/Notes similarity index 100% rename from java/src/org/broadinstitute/sting/utils/pileup2/Notes rename to public/java/src/org/broadinstitute/sting/utils/pileup2/Notes diff --git a/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java rename to public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java rename to public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java 
b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java rename to public/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java rename to public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java rename to public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriter.java rename to public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriter.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java rename to public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java rename to public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java rename to public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java rename to public/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java rename to public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java rename to public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java rename to public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java 
b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java rename to public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java rename to public/java/src/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java rename to public/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java diff --git a/java/src/org/broadinstitute/sting/utils/sam/package-info.java b/public/java/src/org/broadinstitute/sting/utils/sam/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/sam/package-info.java rename to public/java/src/org/broadinstitute/sting/utils/sam/package-info.java diff --git a/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java rename to public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java rename to public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java diff --git 
a/java/src/org/broadinstitute/sting/utils/text/XReadLines.java b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/text/XReadLines.java rename to public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java diff --git a/java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java b/public/java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java rename to public/java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java diff --git a/java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java rename to public/java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java diff --git a/java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java rename to public/java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java diff --git a/java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java rename to public/java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java diff --git 
a/java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java b/public/java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java rename to public/java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java diff --git a/java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java b/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java rename to public/java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java diff --git a/java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java b/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java rename to public/java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java diff --git a/java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java rename to public/java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java diff --git a/java/src/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java rename to public/java/src/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java diff --git a/java/src/org/broadinstitute/sting/utils/threading/package-info.java 
b/public/java/src/org/broadinstitute/sting/utils/threading/package-info.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/threading/package-info.java rename to public/java/src/org/broadinstitute/sting/utils/threading/package-info.java diff --git a/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java diff --git a/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java diff --git a/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java diff --git a/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java diff --git a/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java similarity index 100% rename from 
java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java diff --git a/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java diff --git a/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java diff --git a/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java diff --git a/java/src/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java b/public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java rename to 
public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java diff --git a/java/src/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java b/public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java rename to public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java diff --git a/java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java b/public/java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java rename to public/java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java diff --git a/java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java b/public/java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java rename to public/java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java diff --git a/java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java b/public/java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java similarity index 100% rename from java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java rename to public/java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java diff --git a/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java b/public/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java similarity index 100% rename from java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java rename to public/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java diff --git a/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java b/public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java similarity index 100% rename from 
java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java rename to public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java diff --git a/java/test/net/sf/samtools/GATKChunkUnitTest.java b/public/java/test/net/sf/samtools/GATKChunkUnitTest.java similarity index 100% rename from java/test/net/sf/samtools/GATKChunkUnitTest.java rename to public/java/test/net/sf/samtools/GATKChunkUnitTest.java diff --git a/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/BaseTest.java rename to public/java/test/org/broadinstitute/sting/BaseTest.java diff --git a/java/test/org/broadinstitute/sting/StingTextReporter.java b/public/java/test/org/broadinstitute/sting/StingTextReporter.java similarity index 100% rename from java/test/org/broadinstitute/sting/StingTextReporter.java rename to public/java/test/org/broadinstitute/sting/StingTextReporter.java diff --git a/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/WalkerTest.java rename to public/java/test/org/broadinstitute/sting/WalkerTest.java diff --git a/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java rename to public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java diff --git a/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java b/public/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java rename to 
public/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java similarity index 100% rename from 
java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java diff --git 
a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java similarity index 100% rename from 
java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java similarity 
index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java diff --git 
a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java 
diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDataUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDataUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDataUnitTest.java rename to 
public/java/test/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDataUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java similarity index 100% rename from 
java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java similarity index 100% rename from 
java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java diff --git 
a/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java similarity index 100% rename from 
java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java rename to 
public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java diff --git 
a/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java similarity index 100% rename from 
java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java b/public/java/test/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java rename to public/java/test/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java diff --git 
a/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java diff --git a/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/MWUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/MWUnitTest.java rename to 
public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/PathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/PathUtilsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/PathUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/PathUtilsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java 
rename to public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java diff --git 
a/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java diff --git 
a/java/test/org/broadinstitute/sting/utils/genotype/DiploidGenotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/DiploidGenotypeUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/genotype/DiploidGenotypeUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/genotype/DiploidGenotypeUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java diff --git 
a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIteratorUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIteratorUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIteratorUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java rename to 
public/java/test/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java rename to 
public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/variantcontext/AlleleUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/AlleleUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/variantcontext/AlleleUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/variantcontext/AlleleUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java similarity index 100% rename from java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java diff --git a/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java similarity index 100% rename from 
java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java diff --git a/packages/Aligner.xml b/public/packages/Aligner.xml similarity index 100% rename from packages/Aligner.xml rename to public/packages/Aligner.xml diff --git a/packages/AnalyzeCovariates.xml b/public/packages/AnalyzeCovariates.xml similarity index 100% rename from packages/AnalyzeCovariates.xml rename to public/packages/AnalyzeCovariates.xml diff --git a/packages/CreatePackager.xsl b/public/packages/CreatePackager.xsl similarity index 100% rename from packages/CreatePackager.xsl rename to public/packages/CreatePackager.xsl diff --git a/packages/FindContaminatingReadGroups.xml b/public/packages/FindContaminatingReadGroups.xml similarity index 100% rename from packages/FindContaminatingReadGroups.xml rename to public/packages/FindContaminatingReadGroups.xml diff --git a/packages/GATK-Picard.xml b/public/packages/GATK-Picard.xml similarity index 100% rename from packages/GATK-Picard.xml rename to public/packages/GATK-Picard.xml diff --git a/packages/GATKEngine.xml b/public/packages/GATKEngine.xml similarity index 100% rename from packages/GATKEngine.xml rename to public/packages/GATKEngine.xml diff --git a/packages/GATKResources.xml b/public/packages/GATKResources.xml similarity index 100% rename from packages/GATKResources.xml rename to public/packages/GATKResources.xml diff --git a/packages/GenomeAnalysisTK.xml b/public/packages/GenomeAnalysisTK.xml similarity index 100% rename from packages/GenomeAnalysisTK.xml rename to public/packages/GenomeAnalysisTK.xml diff --git a/packages/IndelGenotyper.xml b/public/packages/IndelGenotyper.xml similarity index 100% rename from packages/IndelGenotyper.xml rename to public/packages/IndelGenotyper.xml diff --git a/packages/LocalRealignmentAroundIndels.xml b/public/packages/LocalRealignmentAroundIndels.xml similarity index 100% 
rename from packages/LocalRealignmentAroundIndels.xml rename to public/packages/LocalRealignmentAroundIndels.xml diff --git a/packages/PicardPrivate.xml b/public/packages/PicardPrivate.xml similarity index 100% rename from packages/PicardPrivate.xml rename to public/packages/PicardPrivate.xml diff --git a/packages/QualityScoresRecalibration.xml b/public/packages/QualityScoresRecalibration.xml similarity index 100% rename from packages/QualityScoresRecalibration.xml rename to public/packages/QualityScoresRecalibration.xml diff --git a/packages/Queue.xml b/public/packages/Queue.xml similarity index 100% rename from packages/Queue.xml rename to public/packages/Queue.xml diff --git a/packages/RMDIndexer.xml b/public/packages/RMDIndexer.xml similarity index 100% rename from packages/RMDIndexer.xml rename to public/packages/RMDIndexer.xml diff --git a/packages/UnifiedGenotyper.xml b/public/packages/UnifiedGenotyper.xml similarity index 100% rename from packages/UnifiedGenotyper.xml rename to public/packages/UnifiedGenotyper.xml diff --git a/packages/VariantAnnotator.xml b/public/packages/VariantAnnotator.xml similarity index 100% rename from packages/VariantAnnotator.xml rename to public/packages/VariantAnnotator.xml diff --git a/packages/VariantEval.xml b/public/packages/VariantEval.xml similarity index 100% rename from packages/VariantEval.xml rename to public/packages/VariantEval.xml diff --git a/packages/VariantFiltration.xml b/public/packages/VariantFiltration.xml similarity index 100% rename from packages/VariantFiltration.xml rename to public/packages/VariantFiltration.xml diff --git a/packages/VariantRecalibration.xml b/public/packages/VariantRecalibration.xml similarity index 100% rename from packages/VariantRecalibration.xml rename to public/packages/VariantRecalibration.xml diff --git a/scala/qscript/core/DataProcessingPipeline.scala b/public/scala/qscript/core/DataProcessingPipeline.scala similarity index 100% rename from 
scala/qscript/core/DataProcessingPipeline.scala rename to public/scala/qscript/core/DataProcessingPipeline.scala diff --git a/scala/qscript/core/GATKResourcesBundle.scala b/public/scala/qscript/core/GATKResourcesBundle.scala similarity index 100% rename from scala/qscript/core/GATKResourcesBundle.scala rename to public/scala/qscript/core/GATKResourcesBundle.scala diff --git a/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala similarity index 100% rename from scala/qscript/core/MethodsDevelopmentCallingPipeline.scala rename to public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala diff --git a/scala/qscript/core/StandardVariantEvaluation.scala b/public/scala/qscript/core/StandardVariantEvaluation.scala similarity index 100% rename from scala/qscript/core/StandardVariantEvaluation.scala rename to public/scala/qscript/core/StandardVariantEvaluation.scala diff --git a/scala/qscript/examples/ExampleCountLoci.scala b/public/scala/qscript/examples/ExampleCountLoci.scala similarity index 100% rename from scala/qscript/examples/ExampleCountLoci.scala rename to public/scala/qscript/examples/ExampleCountLoci.scala diff --git a/scala/qscript/examples/ExampleCountReads.scala b/public/scala/qscript/examples/ExampleCountReads.scala similarity index 100% rename from scala/qscript/examples/ExampleCountReads.scala rename to public/scala/qscript/examples/ExampleCountReads.scala diff --git a/scala/qscript/examples/ExampleCustomWalker.scala b/public/scala/qscript/examples/ExampleCustomWalker.scala similarity index 100% rename from scala/qscript/examples/ExampleCustomWalker.scala rename to public/scala/qscript/examples/ExampleCustomWalker.scala diff --git a/scala/qscript/examples/ExampleUnifiedGenotyper.scala b/public/scala/qscript/examples/ExampleUnifiedGenotyper.scala similarity index 100% rename from scala/qscript/examples/ExampleUnifiedGenotyper.scala rename to 
public/scala/qscript/examples/ExampleUnifiedGenotyper.scala diff --git a/scala/qscript/examples/HelloWorld.scala b/public/scala/qscript/examples/HelloWorld.scala similarity index 100% rename from scala/qscript/examples/HelloWorld.scala rename to public/scala/qscript/examples/HelloWorld.scala diff --git a/scala/qscript/lib/Vcf2Table.q b/public/scala/qscript/lib/Vcf2Table.q similarity index 100% rename from scala/qscript/lib/Vcf2Table.q rename to public/scala/qscript/lib/Vcf2Table.q diff --git a/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/QCommandLine.scala rename to public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala diff --git a/scala/src/org/broadinstitute/sting/queue/QException.scala b/public/scala/src/org/broadinstitute/sting/queue/QException.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/QException.scala rename to public/scala/src/org/broadinstitute/sting/queue/QException.scala diff --git a/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/QScript.scala rename to public/scala/src/org/broadinstitute/sting/queue/QScript.scala diff --git a/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/QScriptManager.scala rename to public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala diff --git a/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/QSettings.scala rename to 
public/scala/src/org/broadinstitute/sting/queue/QSettings.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala 
b/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala 
b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/QNode.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/RunnerStatus.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/RunnerStatus.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/RunnerStatus.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/RunnerStatus.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala similarity index 100% rename from 
scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala diff --git a/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala rename to public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala similarity index 100% rename from 
scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala 
b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala 
b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala 
b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala diff 
--git a/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala b/public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/QFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala 
b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala rename to 
public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala rename to public/scala/src/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala rename to public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala b/public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala rename to public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala rename to public/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala similarity index 100% rename from 
scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala rename to public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala rename to public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala rename to public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala rename to public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala rename to public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala similarity index 
100% rename from scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala rename to public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/CollectionUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/CollectionUtils.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/CollectionUtils.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/CollectionUtils.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/CommandLineJob.scala b/public/scala/src/org/broadinstitute/sting/queue/util/CommandLineJob.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/CommandLineJob.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/CommandLineJob.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala b/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/EmailSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/util/EmailSettings.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/EmailSettings.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/EmailSettings.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/JobExitException.scala 
b/public/scala/src/org/broadinstitute/sting/queue/util/JobExitException.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/JobExitException.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/JobExitException.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/Logging.scala b/public/scala/src/org/broadinstitute/sting/queue/util/Logging.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/Logging.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/Logging.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala b/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/ProcessController.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ProcessController.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/ProcessController.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/ProcessController.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/Retry.scala b/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/Retry.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala diff --git 
a/scala/src/org/broadinstitute/sting/queue/util/RetryException.scala b/public/scala/src/org/broadinstitute/sting/queue/util/RetryException.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/RetryException.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/RetryException.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/ShellJob.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ShellJob.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/ShellJob.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/ShellJob.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala b/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/TextFormatUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/TextFormatUtils.scala similarity index 100% rename from 
scala/src/org/broadinstitute/sting/queue/util/TextFormatUtils.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/TextFormatUtils.scala diff --git a/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala b/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala diff --git a/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala b/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala rename to public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala diff --git a/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala b/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala rename to public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala diff --git a/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala b/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala similarity index 100% rename from scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala rename to public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala diff --git a/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala rename to public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala diff --git 
a/scala/test/org/broadinstitute/sting/queue/pipeline/IPFLibraryPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/IPFLibraryPipelineTest.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/pipeline/IPFLibraryPipelineTest.scala rename to public/scala/test/org/broadinstitute/sting/queue/pipeline/IPFLibraryPipelineTest.scala diff --git a/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala rename to public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala diff --git a/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala rename to public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala diff --git a/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala rename to public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala diff --git a/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala rename to public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala diff --git 
a/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala rename to public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala diff --git a/scala/test/org/broadinstitute/sting/queue/util/IOUtilsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/util/IOUtilsUnitTest.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/util/IOUtilsUnitTest.scala rename to public/scala/test/org/broadinstitute/sting/queue/util/IOUtilsUnitTest.scala diff --git a/scala/test/org/broadinstitute/sting/queue/util/ShellJobUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/util/ShellJobUnitTest.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/util/ShellJobUnitTest.scala rename to public/scala/test/org/broadinstitute/sting/queue/util/ShellJobUnitTest.scala diff --git a/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala rename to public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala diff --git a/scala/test/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala similarity index 100% rename from scala/test/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala rename to public/scala/test/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala diff --git a/testdata/HiSeq.10000.vcf b/public/testdata/HiSeq.10000.vcf similarity index 100% rename from 
testdata/HiSeq.10000.vcf rename to public/testdata/HiSeq.10000.vcf diff --git a/testdata/TabularDataTest.dat b/public/testdata/TabularDataTest.dat similarity index 100% rename from testdata/TabularDataTest.dat rename to public/testdata/TabularDataTest.dat diff --git a/testdata/TabularDataTest2.dat b/public/testdata/TabularDataTest2.dat similarity index 100% rename from testdata/TabularDataTest2.dat rename to public/testdata/TabularDataTest2.dat diff --git a/testdata/ValidatingPileupTargets.list b/public/testdata/ValidatingPileupTargets.list similarity index 100% rename from testdata/ValidatingPileupTargets.list rename to public/testdata/ValidatingPileupTargets.list diff --git a/testdata/defaultGATKConfig.cfg b/public/testdata/defaultGATKConfig.cfg similarity index 100% rename from testdata/defaultGATKConfig.cfg rename to public/testdata/defaultGATKConfig.cfg diff --git a/testdata/exampleBAM.bam b/public/testdata/exampleBAM.bam similarity index 100% rename from testdata/exampleBAM.bam rename to public/testdata/exampleBAM.bam diff --git a/testdata/exampleBAM.bam.bai b/public/testdata/exampleBAM.bam.bai similarity index 100% rename from testdata/exampleBAM.bam.bai rename to public/testdata/exampleBAM.bam.bai diff --git a/testdata/exampleFASTA.dict b/public/testdata/exampleFASTA.dict similarity index 100% rename from testdata/exampleFASTA.dict rename to public/testdata/exampleFASTA.dict diff --git a/testdata/exampleFASTA.fasta b/public/testdata/exampleFASTA.fasta similarity index 100% rename from testdata/exampleFASTA.fasta rename to public/testdata/exampleFASTA.fasta diff --git a/testdata/exampleFASTA.fasta.fai b/public/testdata/exampleFASTA.fasta.fai similarity index 100% rename from testdata/exampleFASTA.fasta.fai rename to public/testdata/exampleFASTA.fasta.fai diff --git a/testdata/exampleNORG.bam b/public/testdata/exampleNORG.bam similarity index 100% rename from testdata/exampleNORG.bam rename to public/testdata/exampleNORG.bam diff --git 
a/testdata/exampleNORG.bam.bai b/public/testdata/exampleNORG.bam.bai similarity index 100% rename from testdata/exampleNORG.bam.bai rename to public/testdata/exampleNORG.bam.bai diff --git a/testdata/gFFTest.gff b/public/testdata/gFFTest.gff similarity index 100% rename from testdata/gFFTest.gff rename to public/testdata/gFFTest.gff diff --git a/testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap b/public/testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap similarity index 100% rename from testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap rename to public/testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap diff --git a/testdata/logisticParamsTest.list b/public/testdata/logisticParamsTest.list similarity index 100% rename from testdata/logisticParamsTest.list rename to public/testdata/logisticParamsTest.list diff --git a/testdata/recalConfig_1KG.cfg b/public/testdata/recalConfig_1KG.cfg similarity index 100% rename from testdata/recalConfig_1KG.cfg rename to public/testdata/recalConfig_1KG.cfg diff --git a/testdata/sampleBedFile.bed b/public/testdata/sampleBedFile.bed similarity index 100% rename from testdata/sampleBedFile.bed rename to public/testdata/sampleBedFile.bed diff --git a/testdata/small.dbsnp.rod b/public/testdata/small.dbsnp.rod similarity index 100% rename from testdata/small.dbsnp.rod rename to public/testdata/small.dbsnp.rod diff --git a/testdata/tranches.4.txt b/public/testdata/tranches.4.txt similarity index 100% rename from testdata/tranches.4.txt rename to public/testdata/tranches.4.txt diff --git a/testdata/tranches.6.txt b/public/testdata/tranches.6.txt similarity index 100% rename from testdata/tranches.6.txt rename to public/testdata/tranches.6.txt diff --git a/testdata/tranches.raw.dat b/public/testdata/tranches.raw.dat similarity index 100% rename from testdata/tranches.raw.dat rename to public/testdata/tranches.raw.dat diff --git a/testdata/vcf/vcfWithGenotypes.vcf b/public/testdata/vcf/vcfWithGenotypes.vcf similarity index 100% 
rename from testdata/vcf/vcfWithGenotypes.vcf rename to public/testdata/vcf/vcfWithGenotypes.vcf diff --git a/testdata/vcf/vcfWithoutGenotypes.vcf b/public/testdata/vcf/vcfWithoutGenotypes.vcf similarity index 100% rename from testdata/vcf/vcfWithoutGenotypes.vcf rename to public/testdata/vcf/vcfWithoutGenotypes.vcf diff --git a/python/1kgStatsForCalls.py b/python/1kgStatsForCalls.py deleted file mode 100755 index 07d5b50d9..000000000 --- a/python/1kgStatsForCalls.py +++ /dev/null @@ -1,277 +0,0 @@ -# import farm_commands2 -import os.path -import sys -from optparse import OptionParser -import glob -import operator -import itertools -import re -import vcfReader -import string -import gzip - -def openMaybeGZ(filename): - if ( filename.endswith(".gz") ): - return gzip.open(filename) - else: - return open(filename) - -def average(l): - sum = reduce(operator.add, l, 0) - return sum / (1.0 * (max(len(l), 1))) - -def printHeaderSep(): - print - print ''.join(['-'] * 80) - -class Sample: - def __init__(self, name): - self.name = name - self.rawBases = 0 - self.mappedBases = 0 - self.nSNPs = 0 - self.nIndels = 0 - - def getName(self): return self.name - def getNSNPs(self): return self.nSNPs - def getNIndels(self): return self.nIndels - - def __str__(self): - return '[%s rawBases=%d mappedBases=%d percentMapped=%.2f nSNPs=%d nIndels=%d]' % (self.name, self.rawBases, self.mappedBases, (self.mappedBases * 100.0) / max(self.rawBases,1), self.nSNPs, self.nIndels) - __repr__ = __str__ - -def flatFileIterator(file, fields = None, skip = 0): - count = 0 - for line in openMaybeGZ(file): - count += 1 - if count > skip: - s = map(string.strip, line.split('\t')) - if ( fields != None ): - s = map(lambda field: s[field], fields) - - if len(s) == 1: s = s[0] - yield s - -# 1. FASTQ_FILE, path to fastq file on ftp site -# 2. MD5, md5sum of file -# 3. RUN_ID, SRA/ERA run accession -# 4. STUDY_ID, SRA/ERA study accession -# 5. STUDY_NAME, Name of stury -# 6. 
CENTER_NAME, Submission centre name -# 7. SUBMISSION_ID, SRA/ERA submission accession -# 8. SUBMISSION_DATE, Date sequence submitted, YYYY-MM-DAY -# 9. SAMPLE_ID, SRA/ERA sample accession -# 10. SAMPLE_NAME, Sample name -# 11. POPULATION, Sample population -# 12. EXPERIMENT_ID, Experiment accession -# 13. INSTRUMENT_PLATFORM, Type of sequencing machine -# 14. INSTRUMENT_MODEL, Model of sequencing machine -# 15. LIBRARY_NAME, Library name -# 16. RUN_NAME, Name of machine run -# 17. RUN_BLOCK_NAME, Name of machine run sector -# 18. INSERT_SIZE, Submitter specifed insert size -# 19. LIBRARY_LAYOUT, Library layout, this can be either PAIRED or SINGLE -# 20. PAIRED_FASTQ, Name of mate pair file if exists (Runs with failed mates will have -# a library layout of PAIRED but no paired fastq file) -# 21. WITHDRAWN, 0/1 to indicate if the file has been withdrawn, only present if a file has been withdrawn -# 22. WITHDRAWN_DATE, date of withdrawal, this should only be defined if a file is -# withdrawn -# 23. COMMENT, comment about reason for withdrawal -# 24. READ_COUNT, read count for the file -# 25. 
BASE_COUNT, basepair count for the file -def countBases(samples, seqIndex): - total = 0 - - for project, sampleID, withdrawnP, bases in flatFileIterator(seqIndex, [3,9,20,24]): - if ( withdrawnP == "0" and useProject(project) and sampleID in samples ): - if OPTIONS.verbose: print project, sampleID, withdrawnP, bases - sample = samples[sampleID] - sample.rawBases += int(bases) - total += int(bases) - - printStatus(samples) - print 'Total raw bases', total - return total - -def printStatus(samples): - if OPTIONS.verbose: - for sample in samples.itervalues(): - print sample - -def findVariantEvalResults(key, file, type=str): - def capture1(line): - if key in line: - s = line.split() - return type(s[len(s)-1]) - else: - return None - - return [val for val in map(capture1, openMaybeGZ(file)) if val != None] - - -def getDBSNPRate(file): - if file != None: - key = "[evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point=% evals at comp]" - r = findVariantEvalResults(key, file, float) - if len(r) > 0: - return r[0] - else: - return -1 - else: - return -1 - -def useProject(project): - #print 'match', project, OPTIONS.project, re.match(OPTIONS.project, project) - return OPTIONS.project == None or re.match(OPTIONS.project, project) != None - -def countMappedBases(samples, alignmentIndex): - if ( OPTIONS.coverageFile != None ): - # read from summary file, looking for the line: - # Total 340710 1187.14 N/A N/A N/A - for parts in map( string.split, openMaybeGZ(OPTIONS.coverageFile) ): - if parts[0] == "Total": - return -1, int(parts[1]) - else: - return readMappedBasesFromBAS(samples, alignmentIndex) - -def readMappedBasesFromBAS(samples, alignmentIndex): - totalBases = 0 - totalMapped = 0 - - for project, sampleID, basFile in flatFileIterator(alignmentIndex, [2,3,6]): - #print project, sampleID, basFile - if ( useProject(project) and sampleID in samples ): - if OPTIONS.verbose: print 
project, sampleID, basFile - sample = samples[sampleID] - - for rawBases, mappedBases in flatFileIterator(os.path.join(OPTIONS.root, basFile), [7, 8], skip=1): - #print ' ->', rawBases, mappedBases - if OPTIONS.rawBasesFromBas: - sample.rawBases += int(rawBases) - totalBases += int(rawBases) - sample.mappedBases += int(mappedBases) - totalMapped += int(mappedBases) - #print ' totals', totalBases, totalMapped - - printStatus(samples) - print 'Total raw bases', totalBases - print 'Total mapped bases', totalMapped - - return totalBases, totalMapped - -def countSNPs(samples, snpsVCF, useIndels = False): - total = 0 - novel = 0 - - header, columnNames, remainingLines = vcfReader.readVCFHeader(openMaybeGZ(snpsVCF)) - sampleIDs = columnNames[9:] - - print 'Counting SNPs...' - #lines = 0 - for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False): - #lines += 1 - #print 'lines', lines - if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ): - break - - if vcf.passesFilters() and vcf.getChrom() not in ['MT', 'Y']: - if ( vcf.isVariant() ): - total += 1 - - if ( vcf.isNovel() ): novel += 1 - - if ( OPTIONS.verbose and total % 10000 == 0 ): - print ' progress', vcf.getChrom(), vcf.getPos() - - genotypes = vcf.rest[1:] - for sampleID, genotypeField in itertools.izip(sampleIDs, genotypes): - #print sampleID, samples - if sampleID in samples: - genotype = genotypeField.split(':')[0] - variant = genotype != "0/0" and genotype != "0|0" and genotype != "0\0" and genotype != "./." 
- #print ' => ', vcf, sampleID, genotype, variant - if variant: - if ( useIndels ): - samples[sampleID].nIndels += 1 - else: - samples[sampleID].nSNPs += 1 - else: - print 'rejecting line', vcf - - printStatus(samples) - return total, novel - -def countIndels(samples, indelsVCF): - total = 0 - - if ( indelsVCF != None ): - return countSNPs(samples, indelsVCF, True) - - return total, 0 - -def readSamples(vcf): - print 'Reading samples for', OPTIONS.population - header, columnNames, remainingLines = vcfReader.readVCFHeader(openMaybeGZ(vcf)) - samples = map(Sample, columnNames[9:]) - if ( OPTIONS.onlySample != None ): - samples = filter( lambda x: x.getName() == OPTIONS.onlySample, samples ) - - print 'No. samples: ', len(samples) - print 'Samples: ', map(Sample.getName, samples) - - return dict(map( lambda x: (x.getName(), x), samples)) - -if __name__ == "__main__": - usage = "usage: %prog" - parser = OptionParser(usage=usage) - parser.add_option("-a", "--alignmentIndex", dest="alignmentIndex",type='string', default=None, help="1KG formated alignment index file") - parser.add_option("-s", "--sequenceIndex", dest="sequenceIndex", type='string', default=None, help="1KG formated sequence index file") - parser.add_option("", "--onlySample", dest="onlySample", type='string', default=None, help="If provide, only this sample will be processed") - parser.add_option("", "--snps", dest="snps", type='string', default=None, help="SNPs VCF") - parser.add_option("", "--snpsEval", dest="snpsVE", type='string', default=None, help="SNPs VCF VariantEval") - parser.add_option("", "--indels", dest="indels", type='string', default=None, help="Indels VCF") - parser.add_option("", "--indelsEval", dest="indelsVE", type='string', default=None, help="Indels VCF VariantEval") - parser.add_option("", "--totalGenome", dest="totalGenome", type='float', default=2.96e9, help="Size, in bp, of the callable genome") - parser.add_option("", "--calledGenome", dest="calledGenome", type='float', 
default=None, help="Size, in bp, of the callable genome") - parser.add_option("-p", "--pop", dest="population", type='string', default="Anonymous", help="Population") - parser.add_option("", "--project", dest="project", type='string', default=None, help="If provided, will only include fastq/BAM files that match this project in the stats calculations") - parser.add_option("-r", "--root", dest="root",type='string', default=".", help="Path to the 1KG data") - parser.add_option("-M", "--maxRecords", dest="maxRecords", type='int', default=-1, help="If provided, will only include fastq/BAM files that match this regex in the stats calculations") - parser.add_option("-v", "--verbose", dest="verbose", action='store_true', default=False, help="If provided, will be verbose during output") - parser.add_option("", "--rawBasesFromBas", dest="rawBasesFromBas", action='store_true', default=False, help="If provided, we'll take our raw base counts from the BAS file") - parser.add_option("-o", "--output", dest="output",type='string', default=None, help="Path to the 1KG data") - parser.add_option("-c", "--coverageFile", dest="coverageFile",type='string', default=None, help="Path to GATK DoC .sample_summary file") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 0: - parser.error("incorrect number of arguments") - - samples = readSamples(OPTIONS.snps) - nSamples = len(samples) - - ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex) - totalBases = countBases(samples, OPTIONS.sequenceIndex) - meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / (max(nSamples, 1)) - totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps) - totalIndels, novelIndels = countIndels(samples, OPTIONS.indels) - - snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE) - indelNoveltyRate = novelIndels / (1.0*max(totalIndels,1)) # 100 - getDBSNPRate(OPTIONS.indelsVE) - - out = sys.stdout - if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w') - print >> out, 'number 
of samples', nSamples - print >> out, 'total raw bases', totalBases - print >> out, 'total mapped bases', totalMappedBases - - # mean mapped depth is total bases mapped divided by acgt reference base count divided by number of individuals, after rmdup: for exons this is calculated on the target region only - - print >> out, 'mean mapped depth', meanMappedDepth - - print >> out, 'bases called (fraction ref genome) %f (%.2f%%)' % (OPTIONS.calledGenome, 100.0 * OPTIONS.calledGenome / OPTIONS.totalGenome) - print >> out, 'number of SNP sites (%% novel) %d (%.2f%%)' % (totalSNPs, snpNoveltyRate) - print >> out, 'average # SNP sites per individual %.0f' % average(map(Sample.getNSNPs, samples.itervalues())) - print >> out, 'number of indel sites (%% novel) %d (%.2f%%)' % (totalIndels, indelNoveltyRate) - print >> out, 'average # indel sites per individual %.0f' % average(map(Sample.getNIndels, samples.itervalues())) - out.close() - diff --git a/python/AnnotateVCFwithMAF.py b/python/AnnotateVCFwithMAF.py deleted file mode 100755 index 07378a0c9..000000000 --- a/python/AnnotateVCFwithMAF.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python - -import sys, FlatFileTable, os - -if sys.argv < 3: - print "Usage: AnnotateVCFwithMAF.py VCF_file MAF_file" - sys.exit() - -vcf_filename = sys.argv[1] -maf_filename = sys.argv[2] - -maf_gen = FlatFileTable.record_generator(maf_filename, "\t") - -headers=["gene","type","transcript","strand","genomechange","cDNAchange","codonchange","proteinchange"] - -loci_and_info = [] - -for record in maf_gen: - #print record - #info_string = ",".join(["%s=%s" % (header, record[header]) for header in headers]) - info_string = "" - for index,header in enumerate(headers): - if record.has_key(header): - if index > 0: - info_string += ";" - info_string += "%s=%s" % (header, record[header]) - - locus = record["chr"]+":"+record["start"] - - #print locus, info_string - loci_and_info.append((locus, info_string)) - -#vcf_gen = 
FlatFileTable.record_generator(vcf_file, "\t", 34) -vcf_file = open(vcf_filename) -vcf_out_file = open(os.path.splitext(os.path.basename(vcf_filename))[0]+".maf_annotated.vcf", "w") -vcf_format_line = vcf_file.readline() -vcf_out_file.write(vcf_format_line) -if vcf_format_line != "##fileformat=VCFv3.3\n" and vcf_format_line != "##fileformat=VCFv4.0": - print ("VCF not v 3.3 or v4.0") - sys.exit() - -header = vcf_file.readline() -while header != "" and header.startswith("#"): - if header.startswith("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"): - break - vcf_out_file.write(header) - header = vcf_file.readline() - -header_fields = header -if not header_fields.startswith("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"): - print ("VCF header fields not in expected order") - print header_fields - sys.exit() - -vcf_out_file.write("##source=AnnotateVCFwithMAF\n") -for header_field in headers: - vcf_out_file.write("##INFO="+header_field+",1,String,"+header_field+"\n") -vcf_out_file.write(header_fields) - -def addFormat(infoString): - # takes MAF info string and reformats values for usefulness and parseablity - newItems = list() - for item in infoString.split(";"): - keyval = item.split("=") - key = keyval[0] - val = keyval[1] - if key == "codonchange" : - # has the form c.(232-234)CAC>AAC - # want to strip to just the change - codon_change = val.split(")")[1] - numbers = val.split(".")[1].split(")")[0]+")" - newItems.append("codonchange="+codon_change+";codonoffset="+numbers) - if key == "proteinchange" : - # has the form p.H78N - # want to move to H>N - first = val.split(".")[1][0] - last = val[len(val)-1] - num = val.split(".")[1][1:len(val.split(".")[1])-1] - newItems.append("proteinchange="+first+">"+last+";proteinoffset="+num) - if key == "type" : - newItems.append(item) - - return ";".join(newItems) - -for vcf_line, locus_and_info in zip(vcf_file.readlines(), loci_and_info): - vcf_line_fields = vcf_line.split("\t") - vcf_locus = 
vcf_line_fields[0]+":"+vcf_line_fields[1] - #print record - maf_locus, maf_info = locus_and_info - if maf_locus != vcf_locus: - print "ERROR: VCF and MAF loci did not match" - sys.exit() - - vcf_line_fields[7] = vcf_line_fields[7]+";"+addFormat(maf_info) - new_vcf_line = "\t".join(vcf_line_fields) - vcf_out_file.write(new_vcf_line) diff --git a/python/JobDispatcher.py b/python/JobDispatcher.py deleted file mode 100644 index 171a3e525..000000000 --- a/python/JobDispatcher.py +++ /dev/null @@ -1,443 +0,0 @@ -import os -import sys -import subprocess -import time -import re -import unittest -import tempfile -import RefseqLibrary - -MAX_UNNAMED_DEPENDENCIES = 0 -class FarmJob: - def __init__( self, cmd_str_from_user, jobName, projectName, delayTime = None, jobsDependingOnThis = None, - outputFile = None, dependencies = [], dependencyNameString = None, dieOnFail = False, usingFiles = [], memory = None): - self.cmd_str_from_user = cmd_str_from_user - self.jobName = jobName - self.projectName = projectName - self.jobsDependingOnThis = jobsDependingOnThis - self.outputFile = outputFile - self.dieOnFail = dieOnFail - self.delayTime = delayTime - self.dependencies = dependencies - if self.dependencies == None: - self.dependencies = [] - elif type(self.dependencies) != list: - self.dependencies = [self.dependencies] - self.dependencyNameString = dependencyNameString - self.filesToUse = usingFiles ## provides additional protection for stop-resume spawning by keeping a list of files - - if len(self.dependencies) > MAX_UNNAMED_DEPENDENCIES: - depNames = map(FarmJob.getJobName, self.dependencies) - if len(filter(None, depNames)) > 1 and len(self.dependencies) != len(filter(None, depNames)): - # there are some unnamed and some named deps - raise Exception("Bad job names -- some are named and some are unnamed", depName) - - self.jobID = None # None indicates currently unscheduled - self.executionString = None # currently unscheduled - self.executed = False - self.jobStatus = None - 
self.memory = memory.strip("g") - - def __hash__(self): - return self.cmd_str_from_user.__hash__() - - def setDelay(self,delayString): - self.delayTime = delayString - - def getJobName(self): - return self.jobName - - def getJobIDString(self): - if self.jobName == None: - if self.jobID == None: - return "UNNAMED" - else: - return str(self.jobID) - else: - return self.getJobName() - - def __str__(self): - return "[JOB: name=%s id=%s depending on (%s) with cmd=%s]" % (self.getJobName(), self.jobID, ','.join(map(FarmJob.getJobIDString, self.dependencies)), self.cmd_str_from_user) - - def getNumberOfDependencies(self): - return len(self.dependencies) - - def getNumberOfDependers(self): - if ( self.jobsDependingOnThis == None ): - return 0 - else: - return len(jobsDependingOnThis) - -def compareDependence(job1,job2): - if ( job1.getJobName() in job2.dependencies ): - return 1 - elif ( job2.getJobName() in job1.dependencies ): - return -1 - else: - return job1.getNumberOfDependers() - job2.getNumberOfDependers() - -class JobDispatchError(Exception): - - def __init__(self,value): - self.value = value - - def __str__(self): - return repr(self.value) - -def delayToGlobalTime(delayStr): - # delayStr is of the form 1:2:3 for 1 day 2 hrs 3 minutes - dayHrMin = delayStr.split(":") - additionalSecs = 24*60*60*int(dayHrMin[0]) + 60*60*int(dayHrMin[1]) + 60*int(dayHrMin[2]) - futureTime = time.localtime(time.time()+additionalSecs) - return ":".join([str(futureTime[0]),str(futureTime[1]),str(futureTime[2]),str(futureTime[3]),str(futureTime[4])]) - -def buildDependencyString(jobList): - dep = '\'' - paddedList = list() - for name in jobList: - paddedList.append('ended("'+name+'")') - dep += " && ".join(paddedList)+'\'' - return dep - -def buildSubmitString(farmJob,queue): - submitStr = "bsub -q "+queue - if ( farmJob.jobName != None ): - submitStr += " -J "+farmJob.jobName - if ( farmJob.projectName != None ): - submitStr += " -P "+farmJob.projectName - if ( farmJob.outputFile != 
None ): - submitStr += " -o "+farmJob.outputFile - if ( farmJob.delayTime != None ): - submitStr += " -b "+delayToGlobalTime(farmJob.delayTime) - if ( farmJob.dependencies != [] ): - submitStr += " -w "+buildDependencyString(farmJob.dependencies) - if ( farmJob.memory != None ): - submitStr += " -R \"rusage[mem="+farmJob.memory+"]\"" - submitStr += " "+farmJob.cmd_str_from_user - return submitStr - -def writeResumeFile(jobList,start,max,filepath,specialHash = None): - print("Writing start was: "+str(start+max)) - try: - output = open(filepath,'w') - printToFile = True - except IOError: - print("Unable to open resume file, "+filepath+" dumping to stdout") - printToFile = False - hashVal = 0 - if ( specialHash != None ): - hashFunction = specialHash - else: - hashFunction = hash - for i in range(start,len(jobList)): - hashVal = hashVal ^ hashFunction(jobList[i]) - if ( printToFile ): - output.write(str(start+max)+"\n") - output.write(str(hashVal)+"\n") - output.write(jobList[start].cmd_str_from_user) - else: - print(str(start)) - print(str(hashVal)) - print(jobList[start].cmd_str_from_user) - -def writeResumeFileFinal(filepath): - output = open(filepath,'w') - output.write("ALL_JOBS_HAVE_BEEN_SPAWNED") - output.close() - -def checkResumeFile(filepath,jobs,specialHash = None): - if ( filepath == None or not os.path.exists(filepath) ): - return 0 - else: - input = open(filepath) - firstLine = input.readline() - if ( firstLine == "ALL_JOBS_HAVE_BEEN_SPAWNED" ): - raise JobDispatchError("All jobs for this project were spawned.") - else: - start = int(firstLine) - print("reading start was "+str(start)) - hashValToMatch = int(input.readline()) - input.close() - if ( specialHash == None ): - hashFunction = hash - else: - hashFunction = specialHash - hashVal = 0 - for i in range(start,len(jobs)): - hashVal = hashVal ^ hashFunction(jobs[i]) - if ( hashVal == hashValToMatch ): - return start - else: - str1="The commands for remaining jobs hashed to "+str(hashVal)+" but 
previous hash was "+str(hashValToMatch)+"." - str2="Please check the resume file "+filepath+" to see that the job command has not changed." - raise JobDispatchError(str1+" "+str2) - -def hashJobAndIntervals(farmJob): - fjhash = farmJob.__hash__() - interval_file = farmJob.filesToUse - intervals = list() - for line in open(interval_file[0]): - if ( not line.startswith("@") ): - spline = line.strip().split() - try: - intervals.append(Interval(spline[0],int(spline[1]),int(spline[2]))) - except IndexError: - print(line) - raise IndexError("List index out of range") - inthash = 0 - for interval in intervals: - inthash = inthash ^ interval.__hash__() - return fjhash ^ inthash - -class JobDispatcher: - def __init__(self,lsf_queues = ["long"], queue_limits = dict([["long",500]]), exceed_total_limit_action = "fail", print_only = False, action_string = None): - self.queues = lsf_queues - self.limits = queue_limits - self.action = exceed_total_limit_action - self.action_string = action_string - self.spawned = list() - if ( len(queue_limits) > 0 ): - self.maxJobs = sum(queue_limits.values()) - else: - self.maxJobs = -1 - self.print_only = print_only - self.startDays = 0 - self.startHours = 0 - self.startMins = 0 - - def setInitialDelay(self,delay): - dhm = delay.split(":") - self.startDays = int(dhm[0]) - self.startHours = int(dhm[1]) - self.startMins = int(dhm[2]) - - # external accessor, sorts by dependency and ensures user hasn't exceeded the set limits - def dispatchAll(self,farmJobs): - farmJobs.sort(compareDependence) - if ( self.maxJobs > -1 and len(farmJobs) > self.maxJobs ): - if ( self.action == "space" and self.action_string != None): - self._dispatchWithSpacing(farmJobs) - elif ( self.action == "resume" and self.action_string != None ): - self._dispatchWithStopResume(farmJobs) - else: - raise JobDispatchError("Number of jobs to dispatch, "+str(len(farmJobs))+", exceeds maximum ("+str(self.maxJobs)+").") - else: - self._dispatchAll(farmJobs) - - # internal accessor, 
loops over queues and dispatches jobs up to the limit - def _dispatchAll(self,farmJobs): - for queue in self.queues: - dispatchedToQueue = 0 - while ( dispatchedToQueue < self.limits[queue] and len(farmJobs) > 0 ): - farmJob = farmJobs.pop(0) - self._dispatch(farmJob,queue) - self.spawned.append(farmJob) - dispatchedToQueue = 1 + dispatchedToQueue - - # internal dispatch accessor; job limits dealt with at this point - def _dispatch(self,job,queue): - if ( self.print_only ): - print(buildSubmitString(job,queue)) - else: - lsf_response = subprocess.Popen([buildSubmitString(job,queue), ""], shell=True, stdout=subprocess.PIPE).communicate()[0] - p = re.compile('Job <(\d+)> is submitted to queue') - job.jobID = p.match(lsf_response).group(1) - print(lsf_response) - - # spaces jobs out by using the delay command to LSF - def _dispatchWithSpacing(self,farmJobs): - dayHrMin = self.action_string.split(":") - days = self.startDays - hours = self.startHours - mins = self.startMins - dispatchBuffer = list() - while ( len(farmJobs) > 0 ): - nextJob = farmJobs.pop(0) - nextJob.setDelay(":".join([str(days),str(hours),str(mins)])) - dispatchBuffer.append(nextJob) - if ( len(dispatchBuffer) >= self.maxJobs or len(farmJobs) == 0 ): - self._dispatchAll(dispatchBuffer) - days += int(dayHrMin[0]) - hours += int(dayHrMin[1]) - mins += int(dayHrMin[2]) - dispatchBuffer = list() - - # dispatches jobs up to the limit, outputs a file describing where to resume again - # if the file already exists, resumes at that job - # file also contains a hashcode for the remaining jobs -- if these don't match the - # hashcode calculated from farmJobs, dispatching is aborted via an exception - def _dispatchWithStopResume(self,farmJobs,jobHash = None): - startAt = checkResumeFile(self.action_string,farmJobs,jobHash) - dispatchBuffer = list() - while ( len(dispatchBuffer) < self.maxJobs and len(farmJobs) > startAt ): - dispatchBuffer.append(farmJobs.pop(startAt)) - self._dispatchAll(dispatchBuffer) - if ( 
len(farmJobs) > startAt ): - writeResumeFile(farmJobs,startAt,self.maxJobs,self.action_string,jobHash) - else: - writeResumeFileFinal(self.action_string) -class Interval: - def __init__(self,chrom,start,stop): - self.chromosome = chrom - self.start = start - self.stop = stop - - def size(self): - return self.stop - self.start - - def bedFormat(self): - return "\t".join([self.chromosome,str(self.start),str(self.stop),"+","target_whatever"]) - - def __str__(self): - return self.chromosome+":"+str(self.start)+"-"+str(self.stop) - - def __hash__(self): - return self.__str__().__hash__() - -class GATKDispatcher(JobDispatcher): - - def __init__(self,jarfile,memory,walker,args,output_directory,reference = None, bams = None, intervals = None, - queues = ["long"], limits = dict([["long",500]]), print_only = False, action = "fail", delay = None): - self.jarfile = jarfile - self.memory = memory - self.walker = walker - self.args = args - self.reference = reference - self.bams = bams - self.intervals = intervals - self.outputDir = output_directory - self.project = "GSA_GATK_Analysis" - if ( action == "resume" ): - action_string = self.outputDir+"GATKDispatcher/resumeJobs.txt" - elif ( action == "space" ): - action_string = delay - else: - action_string = None - JobDispatcher.__init__(self,queues,limits,action,print_only,action_string) - self.check() - self.baseCommand = "java -Xmx"+self.memory+" -jar "+self.jarfile+" -T "+self.walker+" "+args - - def check(self): - if ( not os.path.exists(self.jarfile) ): - raise JobDispatchError("The provided GATK jarfile "+str(self.jarfile)+" does not exist") - if ( self.intervals != None and not os.path.exists(self.intervals) ): - raise JobDispatchError("The provided interval list file, "+str(self.intervals)+" does not exist.") - if ( self.bams != None and not os.path.exists(self.bams) ): - raise JobDispatchError("The provided bam, or bam list, "+str(self.bams)+" does not exist.") - if ( self.reference == None ): - print("Warning: No 
reference supplied to GATKDispatcher. Most analyses require a reference.") - - def setProject(self,newProject): - self.project = newProject - - def addReadFilter(self,filter,filter_args = None): - if ( filter_args == None ): - self.baseCommand += " -rf "+filter - else: - raise JobDispatchError("GATK Dispatcher does not yet support read filters with arguments (e.g. blacklisting)") - - # remove bam files, reference, intervals, read filters, etc - def resetCommand(self): - self.baseCommand = "java -Xmx"+self.memory+" -jar "+self.jarfile+" -T "+self.walker+" "+args - - def dispatchByInterval(self,base_limit): - self.check() - dispatchCommand = self.baseCommand + " -R "+self.reference - if ( self.bams != None ): - dispatchCommand += " -I "+self.bams - intervals = open(self.intervals) - job_number = 0 - bases_for_job = 0 - intervals_for_job = list() - headerLines = list() - farmJobs = list() - if ( not os.path.exists(self.outputDir+"GATKDispatcher/") ): - os.mkdir(self.outputDir+"GATKDispatcher/") - for line in intervals: - if ( line.startswith("@") ): - headerLines.append(line) - else: - spline = line.strip().split() - chrom = spline[0] - start = int(spline[1]) - stop = int(spline[2]) - interval = Interval(chrom,start,stop) - intervals_for_job.append(interval) - bases_for_job += interval.size() - if ( bases_for_job > base_limit ): - farmJobs.append(self._buildIntervalJob(job_number,headerLines,intervals_for_job,dispatchCommand)) - intervals_for_job = list() - bases_for_job = 0 - job_number += 1 - if ( len(intervals_for_job) > 0 ): ## there's still some leftover - farmJobs.append(self._buildIntervalJob(job_number,headerLines,intervals_for_job,dispatchCommand)) - self.dispatchAll_Interval(farmJobs) - - def dispatchAll_Interval(self,jobs): - if ( self.action == "resume" ): - self._dispatchWithStopResume(jobs,hashJobAndIntervals) - else: - self.dispatchAll(jobs) - - def _buildIntervalJob(self,num,header,intervals,command): - job_dir = 
self.outputDir+"GATKDispatcher/dispatch"+str(num)+"/" - cmd = self.appendOutput(command,num,job_dir) - if ( not os.path.exists(job_dir) ): - os.mkdir(job_dir) - intFile = open(job_dir+"job"+str(num)+"_intervals.interval_list",'w') - intFile.write("".join(header)) - for interval in intervals: - intFile.write(interval.bedFormat()+"\n") - intFile.close() - cmd += " -L "+job_dir+"job"+str(num)+"_intervals.interval_list" - job = FarmJob(cmd,self.project+"_job"+str(num),self.project,None,None,job_dir+"bsub_out.txt",list(),None,False,[job_dir+"job"+str(num)+"_intervals.interval_list"],self.memory) - return job - - def appendOutput(self,command,num,job_dir): - if ( self.walker == "UnifiedGenotyper" ): - cmdToDispatch = command + " -varout "+job_dir+"job"+str(num)+"_calls.vcf" - elif ( self.walker == "CoverageStatistics" or self.walker == "DepthOfCoverage" ): - cmdToDispatch = command + " -o "+job_dir+"job"+str(num) - elif ( self.walker == "CombineDuplicates" or self.walker == "TableRecalibration" or self.walker == "ClipReads" ): - cmdToDispatch = command + " -o "+job_dir+"job"+str(num)+"_output.bam" - else: - cmdToDispatch = command + " -o "+job_dir+"job"+str(num)+".txt" - return cmdToDispatch - - def dispatchByGene(self, geneNames): - self.genes = RefseqLibrary.getRefseqGenes(geneNames) - dispatchCommand = self.baseCommand + " -R "+self.reference - farmJobs = list() - jobNumber = "" - headerLines = RefseqLibrary.getIntervalHeaderLines() - - if ( not os.path.exists(self.outputDir+"GATKDispatcher/") ): - os.mkdir(self.outputDir+"GATKDispatcher/") - if ( self.bams != None ): - dispatchCommand += " -I "+self.bams - - for gene in self.genes: - jobNumber = "_"+gene.getGeneName() - intervals = gene.getExonIntervals() - farmJobs.append(self._buildIntervalJob(jobNumber,headerLines,intervals,dispatchCommand)) - - self.dispatchAll_Interval(farmJobs) - - def dispatchByTargetDesign(self,designFile): - self.genes = RefseqLibrary.parseDesignFile(designFile) - dispatchCommand = 
self.baseCommand + " -R "+self.reference - farmJobs = list() - jobNumber = "" - headerLines = RefseqLibrary.getIntervalHeaderLines() - - if ( not os.path.exists(self.outputDir+"GATKDispatcher/") ): - os.mkdir(self.outputDir+"GATKDispatcher/") - if ( self.bams != None ): - dispatchCommand += " -I "+self.bams - - for gene in self.genes: - jobNumber = "_"+gene.getGeneName() - intervals = gene.getExonIntervals() - farmJobs.append(self._buildIntervalJob(jobNumber,headerLines,intervals,dispatchCommand)) - - self.dispatchAll_Interval(farmJobs) diff --git a/python/JobDispatcherExample.py b/python/JobDispatcherExample.py deleted file mode 100644 index 81fc3dcc1..000000000 --- a/python/JobDispatcherExample.py +++ /dev/null @@ -1,23 +0,0 @@ -import os -import JobDispatcher - -PRINT_ONLY = True -QUEUES = ["gsa","short","long"] -LIMITS = dict([["gsa",50],["short",20],["long",50]]) -ACTION = "space" -TIME_DIFF = "0:1:0" # 1 hour -RESULTS_DIR = "/humgen/gsa-hpprojects/dev/kiran/wholeExomeGeneCoverage/scratch/" -GATK_JAR = "/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar" -MEMORY = "2g" -WALKER = "CoverageStatistics" -REFERENCE = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" -BAM_FILES = RESULTS_DIR+"bam_files_use.list" -INTERVAL_LIST = "/seq/references/HybSelOligos/whole_exome_agilent_designed_120/whole_exome_agilent_designed_120.targets.interval_list" -MAX_BASES_PER_JOB = 100000 -ARGUMENTS = "-mmq 20 -mbq 10 -omitBaseOutput -dels -l INFO" - -dispatcher = JobDispatcher.GATKDispatcher(GATK_JAR,MEMORY,WALKER,ARGUMENTS,RESULTS_DIR,REFERENCE, - BAM_FILES,INTERVAL_LIST,QUEUES,LIMITS,PRINT_ONLY,ACTION,TIME_DIFF) - -dispatcher.setProject("GSA_WholeExome_CoverageByExon") -dispatcher.dispatchByInterval(MAX_BASES_PER_JOB) diff --git a/python/MPGQueuePipelineStatus.py b/python/MPGQueuePipelineStatus.py deleted file mode 100644 index 5c3707cb8..000000000 --- a/python/MPGQueuePipelineStatus.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -import glob -import 
time - -#takes a path to a yaml and defines the project and a number of descriptive features -class status: - def __init__(self, yaml): - self.yaml = yaml - self.project = os.path.basename(self.yaml).split(".")[0] - self.directory = os.path.dirname(self.yaml) - self.version = self.directory.split("/")[4].split("v")[1] - self.dirkey = self.directory.split("/")[3] - if len(glob.glob(self.directory + "/SnpCalls/*.pdf")) >= 1: - self.edate=max([os.path.getmtime(i) for i in glob.iglob(self.directory + "/SnpCalls/*.pdf")]) - self.status = "In Review" - elif len(glob.glob(self.directory + "/*/*.vcf")) >= 5: - self.edate=max([os.path.getmtime(i) for i in glob.iglob(self.directory + "/*/*.vcf")]) - self.status= "Eval" - else: - self.edate=max([os.path.getmtime(i) for i in glob.iglob(self.directory)]) - self.status= "Calling" - self.date = time.strftime("%a %b %d %H:%M",time.localtime(self.edate)) - - -class update: - def __init__(self): - self.projects = glob.iglob('/humgen/gsa-pipeline/*/*/*/*.yaml') - self.updates = [] - for each in self.projects: - Update = status(each) - self.updates.append(Update) - self.updates=sorted(self.updates, key=lambda update: update.edate) - print '{0:60} {1:15} {2:20} {3:7}'.format("Project (version)","status","date","dirkey") # waht is this expecting for these valuse? 
- for s in self.updates: - print '{0:60} {1:15} {2:20} {3:7}'.format(s.project+ " ("+ s.version + ")", s.status, s.date, s.dirkey) - -if __name__ == "__main__": - go = update() diff --git a/python/ParseDCCSequenceData.py b/python/ParseDCCSequenceData.py deleted file mode 100755 index 074da5c01..000000000 --- a/python/ParseDCCSequenceData.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python - -import operator, FlatFileTable - -class db_file: - filename = "" - - def __init__(self, filenm): - self.filename = filenm - - def count_fields(self, fixed_field, fixed_field_values, count_field): - record_gen = FlatFileTable.record_generator(self.filename) - - counts = dict() - #fixed_field_num = self.field_names[fixed_field] - print count_field+" for "+fixed_field+" = "+" or ".join(fixed_field_values) - - for record in record_gen: - if record[fixed_field] in fixed_field_values: - #fixed_field_value = fields[fixed_field_num] - count_field_num = record[count_field] - if counts.has_key(count_field_num): - counts[count_field_num] += 1 - else: - counts[count_field_num] = 0 - - for k,v in sorted(counts.items(), key=operator.itemgetter(1), cmp=lambda x,y: y-x ): - print str(k)+"\t"+str(v) - - def count_bases(self, fixed_field_values): #, fixed_field_values): - record_gen = FlatFileTable.record_generator(self.filename) - - base_count = 0 - #fixed_field_num = self.field_names[fixed_field] - #print "For "+fixed_field+" = "+" or ".join(fixed_field_values)+":", - print "For "+ " AND ".join( [one_ffv[0]+" = "+" OR ".join(one_ffv[1]) for one_ffv in fixed_field_values] ) - - for record in record_gen: - #if record[fixed_field] in fixed_field_values: - if FlatFileTable.record_matches_values(record, fixed_field_values): - try: - base_count += int(record["BASE_COUNT"]) - except ValueError: - pass - - print "%e bases" % base_count - -if __name__ == "__main__": - db = db_file("sequence.index") - - platforms = (("ILLUMINA",), ("AB SOLiD","SOLID","ABI_SOLID","AB SOLiD System 2.0"), ("LS454",)) 
- studies = (("1000Genomes Project Pilot 1",), ("1000Genomes Project Pilot 2",), ("1000Genomes Project Pilot 3",)) - - for select_field, select_field_values in (): #(("INSTRUMENT_PLATFORM", platforms), ("STUDY_NAME", studies)): - for count_field in ("CENTER_NAME", "STUDY_NAME", "INSTRUMENT_PLATFORM"): - for select_field_value in select_field_values: - db.count_fields(select_field, select_field_value, count_field) - - for select_field_value in select_field_values: - db.count_bases(((select_field, select_field_value),)) - - print - - for field1, value1 in zip(["INSTRUMENT_PLATFORM"]*len(platforms), platforms): - for field2, value2 in zip(["STUDY_NAME"]*len(studies), studies): - db.count_bases(((field1, value1), (field2, value2))) - - - - - - - - - diff --git a/python/RefseqLibrary.py b/python/RefseqLibrary.py deleted file mode 100755 index f2e897714..000000000 --- a/python/RefseqLibrary.py +++ /dev/null @@ -1,311 +0,0 @@ -import math -def chrFormat(chr): - if ( chr == "chr0"): - return "chrM" - if ( chr == "chr23" ): - return "chrX" - if ( chr == "chr24" ): - return "chrY" - else: - return chr - -def chr2int(chr): - try: - chr = chr.split("chr")[1] - if ( chr == "M" ): - return 0 - if ( chr == "X" ): - return 23 - if ( chr == "Y" ): - return 24 - return int(chr) - except IndexError: - print("Index error: "+chr) - return -1 - -def intervalCompare(int1,int2): - chr1 = chr2int(int1.chromosome) - chr2 = chr2int(int2.chromosome) - if ( chr1 < chr2 ): - return -1 - elif ( chr1 > chr2 ): - return 1 - else: - start1 = int1.start - start2 = int2.start - return start1 - start2 - -class Interval: - def __init__(self,chrom,start,stop): - self.chromosome = chrom - self.start = start - self.stop = stop - if ( chrom == "NONE" ): - self.isEmpty = True - else: - self.isEmpty = False - self.basesCovered = None - - def size(self): - return self.stop - self.start - - def overlaps(self,other): - if ( self.chromosome == other.chromosome ): - if ( other.stop < self.stop and not other.start 
< self.start ): - return True - if ( other.stop > self.start and not other.start > self.stop ): - return True - return False - - def intersect(self,other): - if ( self.overlaps(other) ): - return Interval(self.chromosome, max(self.start,other.start), min(self.stop,other.stop)) - else: - return Interval("NONE",-1,-1) - - def isBefore(self,other): - if ( chr2int(self.chromosome) < chr2int(other.chromosome) ): - return True - elif ( chr2int(self.chromosome) > chr2int(other.chromosome) ): - return False - else: - if ( other.start > self.stop ): - return True - return False - - def bedFormat(self): - return self.chromosome+"\t"+str(self.start)+"\t"+str(self.stop)+"\t+\ttarget_x" - - def __str__(self): - return self.chromosome + ":" + str(self.start) + "-" + str(self.stop) - - def __cmp__(self,other): - return intervalCompare(self,other) - -class CoveredInterval(Interval): - def __init__(self,chrom,start,stop): - Interval.__init__(self,chrom,start,stop) - self.overlappingSubIntervals = list() - - def updateCoverage(self,other): - if ( other.overlaps(self) ): - self.overlappingSubIntervals.append(other) - - def getBaseCoverage(self): - if ( self.basesCovered is None ): - basesCovered = 0 - intersects = list() - self.overlappingSubIntervals.sort(intervalCompare) - for overlap in self.overlappingSubIntervals: - ival = self.intersect(overlap) - intersects.append(ival) - for i in range(len(intersects)): - basesCovered = basesCovered + intersects[i].size() - for j in range(i+1,len(intersects)): - basesCovered = basesCovered - (intersects[i].intersect(intersects[j])).size() - - self.basesCovered = basesCovered - return self.basesCovered - - def getOverlappingIntervals(self): - return self.overlappingSubIntervals - -class Exon: - def __init__(self,geneName,exonid,chrom,start,stop): - #print("Adding exon for "+geneName+" with "+chrom+":"+str(start)+"-"+str(stop)) - self.interval = CoveredInterval(chrom,start,stop) - self.gene = geneName - self.id = exonid - - def 
getOverlappingIntervals(self): - return self.interval.getOverlappingIntervals() - - def updateCoverage(self,target): - self.interval.updateCoverage(target) - - def overlaps(self,target): - return self.interval.overlaps(target) - - def isBefore(self,target): - return self.interval.isBefore(target) - - def getBaseCoverage(self): - return self.interval.getBaseCoverage() - - def size(self): - return self.interval.size() - - def getBedEntry(self): - return "\t".join([chrFormat(self.interval.chromosome),str(self.interval.start),str(self.interval.stop),self.gene,self.id,str(self.getCoverageProportion())]) - - def getCoverageProportion(self): - if ( self.size() > 0 ): - return float(self.getBaseCoverage())/float(self.size()) - else: - return -1 - - def __str__(self): - return self.gene+"("+self.id+") "+str(self.interval) - - def getInterval(self): - return self.interval - -class Gene: - def __init__(self,name): - self.name = name - self.exons = list() - - def addExon(self,exon): - self.exons.append(exon) - - def size(self): - size = 0 - for exon in self.exons: - size = size + exon.size() - return size - - def getExonIntervals(self): - intervals = list() - for exon in self.exons: - intervals.append(exon.getInterval()) - return intervals - - def getBaseCoverage(self): - coverage = 0 - for exon in self.exons: - coverage = coverage + exon.getBaseCoverage() - return coverage - - def __str__(self): - exonString = list() - for exon in self.exons: - exonString.append(str(exon)) - return self.name+"\t"+"\t".join(exonString) - - def getGeneName(self): - return self.name - - def setGeneName(self,newName): - self.name = newName - -class ExonRecord(Exon): - def __init__(self,geneName,exonid,chrom,start,stop,prop): - Exon.__init__(self,geneName,exonid,chrom,start,stop) - self.coverageProportion = prop - self.baseCoverage = math.ceil(prop*self.size()) - self.records = list() - - def getBaseCoverage(self): - return self.baseCoverage - - def getCoverageProportion(self): - return 
self.coverageProportion - - def addRecord(self,record): - self.records.append(record) - - def getData(self): - toRet = "" - for record in self.records: - toRet += record.dataString()+"\t" - return toRet -class CoverageRecord: - def __init__(self,chrom,start,stop,sampleName,mean,median,q1,q3): - self.interval = Interval(chrom,start,stop) #indexing issues - self.sampleName = sampleName - self.mean = mean - self.median = median - self.q1 = q1 - self.q3 = q3 - - def dataString(self): - return "\t".join([self.sampleName,str(self.mean),str(self.median),str(self.q1),str(self.q3)]) - - def getInterval(self): - return self.interval - -def getRefseqGenes(names): - names = list(names) - refGene = open("/humgen/gsa-hpprojects/GATK/data/refGene.sorted.txt") - refSeq = open("/humgen/gsa-hpprojects/GATK/data/refseq/hg18.ref_gene.cds.bed") - refSeqGeneNames = list() - refNamesToAltNames = dict() - for name in names: - if ( name.startswith("NM_") ): - refSeqGeneNames.append(name) - refNamesToAltNames[name]=name - - if ( len(names) > 0 ): - for line in refGene.readlines(): - spline = line.strip().split("\t") - altName = spline[len(spline)-4] - if ( altName in names ): - if ( not ( altName in refNamesToAltNames.values() ) ): - refSeqGeneNames.append(spline[1]) - refNamesToAltNames[spline[1]]=altName - else: - print("WARNING: multiple transcripts found for gene "+altName+" using first available transcript from refseq export") - - if ( len(names) > len(refSeqGeneNames) ): - for g in refSeqGeneNames: - if ( refNamesToAltNames[g] in names ): - names.remove(refNamesToAltNames[g]) - - raise ValueError("No entry found for genes: "+str(names)) - - # build up the gene list - genes = dict() - for geneName in refSeqGeneNames: - genes[geneName] = Gene(geneName) - - for line in refSeq.readlines(): - spline = line.strip().split("\t") - geneName = spline[3].split("_cds")[0] - if ( geneName in refSeqGeneNames ): - chrom = spline[0] - start = int(spline[1]) - stop = int(spline[2]) - id = 
"cds_"+spline[3].split("_cds_")[1].split("_")[0] - genes[geneName].addExon(Exon(geneName,id,chrom,start,stop)) - - toReturn = list() - for gene in genes.values(): - gene.setGeneName(refNamesToAltNames[gene.getGeneName()]) - toReturn.append(gene) - return toReturn - - -def getIntervalHeaderLines(): - whole_exome_file = open("/humgen/gsa-hpprojects/GATK/data/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.hg18.interval_list") - header = list() - line = whole_exome_file.readline() - while ( line.startswith("@") ): - header.append(line) - line = whole_exome_file.readline() - whole_exome_file.close() - return header - -def parseDesignFile(file): - designFile = open(file) - genes = dict() - for line in designFile.readlines(): - if ( line.startswith("TARGET") ): - spline = line.strip().split() - if ( spline[1].startswith("chr") ): - chrom = spline[1] - else: - chrom = "chr"+spline[1] - start = 1 + int(spline[2]) - stop = 1 + int(spline[3]) - gene_name = spline[4].split("#")[1].split("_")[0] - try: - exon_id = spline[4].split(gene_name)[1] - except IndexError: - exon_id = "_".join(spline[5:len(spline)-1]) - exon = Exon(gene_name,exon_id,chrom,start,stop) - if ( gene_name in genes.keys() ): - genes[gene_name].addExon(exon) - else: - genes[gene_name] = Gene(gene_name) - genes[gene_name].addExon(exon) - return genes.values() diff --git a/python/RunPilot2Pipeline.py b/python/RunPilot2Pipeline.py deleted file mode 100755 index ccf885177..000000000 --- a/python/RunPilot2Pipeline.py +++ /dev/null @@ -1,211 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -from gatkConfigParser import * -import glob -import itertools - -if __name__ == "__main__": - usage = """usage: %prog [-c config.cfg]*""" - - parser = OptionParser(usage=usage) - parser.add_option("-q", "--farm", dest="farmQueue", - type="string", default=None, - help="Farm queue to send processing jobs to") - parser.add_option("-c", "--config", dest="configs", - 
action="append", type="string", default=[], - help="Configuration file") - parser.add_option("-w", "--wait", dest="initialWaitID", - type="string", default=None, - help="If providedm the first job dispatched to LSF will use this id as it ended() prerequisite") - parser.add_option("", "--dry", dest="dry", - action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles", - action='store_true', default=False, - help="Ignores already written files, if present") - parser.add_option("-d", "--dir", dest="outputdir", - type="string", default="./", - help="Output directory") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 0: - parser.error("incorrect number of arguments") - - config = gatkConfigParser(OPTIONS.configs) - - if not os.path.exists(OPTIONS.outputdir): - os.mkdir(OPTIONS.outputdir) - - def outputDir(suffix): - return os.path.join(OPTIONS.outputdir, suffix) - -# Official genome-wide Depth of Coverage tables for pilot 2, freeze 5: -# NA12878 NA12891 NA12892 NA19238 NA19239 NA19240 -# 454: 36 18 -# SLX: 82 91 70 56 68 86 -# SOLID: 37 64 -# 454+SLD: 64 77 -# ALL: 138 150 - DoC_454 = {"NA12878":36,"NA19240":18} - DoC_slx = {"NA12878":82,"NA12891":91,"NA12892":70,"NA19238":56,"NA19239":68,"NA19240":86} - DoC_solid = {"NA12878":37,"NA19240":64} - DoC_454solid = {"NA12878":64,"NA19240":77} - DoC_all = {"NA12878":138,"NA19240":150} - DoC_hash = {"454":DoC_454,"SLX":DoC_slx,"SOLID":DoC_solid,"454SOLID":DoC_454solid,"ALL":DoC_all} - MQ_hash = {"SLX":100,"SOLID":5,"454":5,"454SOLID":10,"ALL":110} - - intervals_dir = outputDir("intervals") - cleaner_output = outputDir("cleaner") - injector_output = outputDir("bams") - snp_output = outputDir("calls/unfiltered_snps") - filter_output = outputDir("calls/filtered_snps") - indel_output = outputDir("calls/indels") - final_bam_dir = outputDir("useTheseBamsForAnalyses") - #final_bam_dir = 
"/humgen/gsa-hphome1/projects/1kg_pilot2/useTheseBamsForAnalyses" - - samples = ["NA12878","NA12891","NA12892","NA19238","NA19239","NA19240"] - techs = ["SLX"] - chrs = range(1, 23) + ["X"] - - for sample in samples: - # - # Actually do some work here - # - def finalBam(tech): - return os.path.join(final_bam_dir, "%s.%s.bam" % ( sample, tech )) - def outputFileTech(root, tech, name): - return os.path.join(root, "%s.%s.%s" % ( sample, tech, name )) - def badSnps( tech ): - return os.path.join(cleaner_output, "%s.%s.realigner.badsnps" % ( sample, tech )) - def indelsForFiltering( tech ): - return outputFileTech(indel_output, tech, "low.calls") - - myTechs = techs - if sample in ["NA12878", "NA19240"]: - myTechs = techs + ["SOLID","454"] - - for tech in myTechs: - - if ( tech != "454" ): - myChrs = chrs - if sample in ["NA12891", "NA19239"]: - myChrs = chrs + ["Y"] - def badSnpsChr( tech, chr ): - return os.path.join(cleaner_output, "%s.chr%s.%s.realigner.badsnps" % ( sample, chr, tech )) - - def makeJobName(suffix): - return sample + "." + tech + "." + suffix - def makeJobClass(suffix): - return sample + ".*." 
+ suffix - - for chr in myChrs: - - bam = "/broad/1KG/DCC/ftp/pilot_data/%s/alignment/%s.chrom%s.%s.SRP000032.2009_07.bam" % ( sample, sample, chr, tech ) - - def outputFile(root, name): - return os.path.join(root, "%s.chr%s.%s.%s" % ( sample, chr, tech, name )) - def MismatchIntervals(bam, outputFile, intervals): - return config.gatkCmd('MismatchIntervals') + " -o " + outputFile + " -L " + intervals + " -I " + bam - def IndelIntervals(bam, outputFile, intervals): - return config.gatkCmd('IndelIntervals') + " -o " + outputFile + " -L " + intervals + " -I " + bam - def MergeIntervals(bam, files, outputFile, intervals): - return config.gatkCmd('IntervalMerger') + " -o " + outputFile + " ".join(map( lambda x: " -intervals " + x, files )) + " -L " + intervals + " -I " + bam - def CleanIntervals(bam, outputFile, intervals, snpfile): - return config.gatkCmd('IntervalCleaner') + " -O " + outputFile + " -L " + intervals + " -I " + bam - def Injector(bam, outputFile, intervals, inputfile): - return config.gatkCmd('CleanedReadInjector') + " --output_bam " + outputFile + " -L " + intervals + " -I " + bam + " --cleaned_reads " + inputfile - - jobid = None - - mismatchIntervalsFile = outputFile(intervals_dir, "mismatches.intervals") - cmd = MismatchIntervals(bam, mismatchIntervalsFile, str(chr)) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, mismatchIntervalsFile, just_print_commands = OPTIONS.dry, waitID = None, jobName = makeJobName("phase1." + str(chr))) - - indelIntervalsFile = outputFile(intervals_dir, "indels.intervals") - cmd = IndelIntervals(bam, indelIntervalsFile, str(chr)) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelIntervalsFile, just_print_commands = OPTIONS.dry, waitID = None, jobName = makeJobName("phase1." 
+ str(chr))) - - mergedIntervalsFile = outputFile(intervals_dir, "merged.intervals") - cmd = MergeIntervals(bam, [mismatchIntervalsFile, indelIntervalsFile], mergedIntervalsFile, str(chr)) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, mergedIntervalsFile, just_print_commands = OPTIONS.dry, waitID = makeJobName("phase1." + str(chr))) - - cleanedFile = outputFile(cleaner_output, "bam") - badsnpsFile = badSnpsChr(tech, str(chr)) - cmd = CleanIntervals(bam, cleanedFile, mergedIntervalsFile, badsnpsFile) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, cleanedFile, just_print_commands = OPTIONS.dry, waitID = jobid) - injectedFile = outputFile(injector_output, "bam") - cmd = Injector(bam, injectedFile, str(chr), cleanedFile) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, injectedFile, just_print_commands = OPTIONS.dry, waitID = jobid) - - cmd = "samtools index " + injectedFile - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, injectedFile + ".bai", just_print_commands = OPTIONS.dry, waitID = jobid, jobName = makeJobName("phase2")) - - def MergeBams(outputFile): - return "MergeBAMBatch.py -d " + cleaner_output + " -q " + OPTIONS.farmQueue + " -s '" + outputFile + "\t" + os.path.join(cleaner_output, "%s.chr*.%s.bam" % ( sample, tech )) + "'" - - cmd = MergeBams(finalBam(tech)) - mergeJobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, finalBam(tech), just_print_commands = OPTIONS.dry, waitID = makeJobName("phase2"), jobName = makeJobName("phase3")) - - cmd = "cat " - for chr in myChrs: - cmd = cmd + " " + badSnpsChr(tech, chr) - cmd = cmd + " > " + badSnps(tech) - badsnpsJobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, badSnps(tech), just_print_commands = OPTIONS.dry, waitID = makeJobName("phase2"), jobName = makeJobName("phase4")) - - def IndelCaller(bam, outputFile, fraction): - return config.gatkCmd('IndelGenotyper') + " -O " + outputFile + " -I " + bam + " -minFraction " + fraction - def SnpCaller(bam, outputFile): - return 
config.gatkCmd('SingleSampleGenotyper') + " -varout " + outputFile + " -I " + bam - def VarFiltration(bam, outputHead, snpcalls, badsnps, indelcalls, depth, mq): - return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -I " + bam + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSNP," + badsnps + ",indels,SimpleIndel," + indelcalls + " -X DepthOfCoverage:max=" + depth + " -X MappingQualityZero:max=" + mq - def VarFiltration454(bam, outputHead, snpcalls, depth, mq): - return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -I " + bam + " -B variant,Variants," + snpcalls + " -X DepthOfCoverage:max=" + depth + " -X MappingQualityZero:max=" + mq - - waitid = makeJobName("phase3") - if ( tech == "454" ): - waitid = None - - bamToCallFrom = finalBam(tech) - indelsFileHigh = outputFileTech(indel_output, tech, "high.calls") - cmd = IndelCaller(bamToCallFrom, indelsFileHigh, "0.3") - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileHigh, just_print_commands = OPTIONS.dry, waitID = waitid) - - indelsFileLow = indelsForFiltering(tech) - cmd = IndelCaller(bamToCallFrom, indelsFileLow, "0.1") - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileLow, just_print_commands = OPTIONS.dry, waitID = waitid, jobName = makeJobName("phase4")) - - snpsFile = outputFileTech(snp_output, tech, "calls") - cmd = SnpCaller(bamToCallFrom, snpsFile) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, snpsFile, just_print_commands = OPTIONS.dry, waitID = waitid, jobName = makeJobName("phase4")) - - varFiltFile = os.path.join(filter_output, "%s.%s" % ( sample, tech )) - if ( tech != "454" ): - cmd = VarFiltration(bamToCallFrom, varFiltFile, snpsFile, badSnps(tech), indelsFileLow, str(DoC_hash[tech][sample]), str(MQ_hash[tech])) - else: - cmd = VarFiltration454(bamToCallFrom, varFiltFile, snpsFile, str(DoC_hash[tech][sample]), str(MQ_hash[tech])) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, varFiltFile, just_print_commands = 
OPTIONS.dry, waitID = makeJobName("phase4")) - - def SnpCaller(bams, outputFile): - return config.gatkCmd('SingleSampleGenotyper') + " -varout " + outputFile + " ".join(map( lambda x: " -I " + x, bams )) - def VarFiltration(bams, outputHead, snpcalls, badsnps, indelcalls, depth, mq): - return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSNP," + badsnps + ",indels,SimpleIndel," + indelcalls + " -X DepthOfCoverage:max=" + depth + " -X MappingQualityZero:max=" + mq + " ".join(map( lambda x: " -I " + x, bams )) - - if sample in ["NA12878", "NA19240"]: - - solid454SnpsFile = outputFileTech(snp_output, "454-SOLID", "calls") - cmd = SnpCaller([finalBam("SOLID"),finalBam("454")], solid454SnpsFile) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, solid454SnpsFile, just_print_commands = OPTIONS.dry, waitID = makeJobClass("phase3")) - - solid454VarFiltFile = os.path.join(filter_output, "%s.454-SOLID" % ( sample )) - cmd = VarFiltration([finalBam("SOLID"),finalBam("454")], solid454VarFiltFile, solid454SnpsFile, badSnps("SOLID"), indelsForFiltering("SOLID"), str(DoC_hash["454SOLID"][sample]), str(MQ_hash["454SOLID"])) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, solid454VarFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid) - - allSnpsFile = outputFileTech(snp_output, "allTechs", "calls") - cmd = SnpCaller([finalBam("SLX"),finalBam("SOLID"),finalBam("454")], allSnpsFile) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allSnpsFile, just_print_commands = OPTIONS.dry, waitID = makeJobClass("phase3")) - allVarFiltFile = os.path.join(filter_output, "%s.allTechs" % ( sample )) - cmd = VarFiltration([finalBam("SLX"),finalBam("SOLID"),finalBam("454")], allVarFiltFile, allSnpsFile, badSnps("SLX"), indelsForFiltering("SLX"), str(DoC_hash["ALL"][sample]), str(MQ_hash["ALL"])) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allVarFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid) 
# Script preamble: fixed VCF column values, the input/output handles and the
# pool (sample column) name.
#
# Positional arguments: <raw calls file> <output VCF file> [pool name]
dbsnp = "."   # can be global. Let something else annotate dbsnp info
filter = "0"  # don't declare any filtering in the VCF file

raw_calls_file = open(sys.argv[1])
output_vcf_file = open(sys.argv[2], 'w')

try:
    pool_name = sys.argv[3]
except IndexError:
    # No explicit pool name was given, so derive one from a file name.
    # The previous fallback split sys.argv[0] (the script's own path) and then
    # indexed one element past the end of the list, so it always raised
    # IndexError instead of producing a name.
    # NOTE(review): assumes the intended source is the raw calls file
    # (sys.argv[1]) -- confirm against how the script is invoked.
    pool_name = os.path.basename(sys.argv[1].strip())
def getProportionNonrefThatAreSNP(list):
    """Return the share of non-reference base observations held by the most
    frequent non-reference base, with a pseudo-count of one added to both the
    numerator and the denominator.

    `list` is one split line of the calls file.  Column positions come from
    the module-level indices (ref_offset, for_*_index, rev_*_index); every
    per-base column is read as the integer in its second ":"-separated part.
    """
    reference = list[ref_offset]

    def strand_total(forward_col, reverse_col):
        # Forward-strand plus reverse-strand count for one base.
        return (int(list[forward_col].split(":")[1])
                + int(list[reverse_col].split(":")[1]))

    per_base = {
        "A": strand_total(for_a_index, rev_a_index),
        "C": strand_total(for_c_index, rev_c_index),
        "G": strand_total(for_g_index, rev_g_index),
        "T": strand_total(for_t_index, rev_t_index),
    }

    # Any reference value other than A, C or G is handled as if it were T.
    skipped = reference if reference in ("A", "C", "G") else "T"
    alternates = [count for base, count in per_base.items() if base != skipped]

    return float(max(alternates) + 1) / float(sum(alternates) + 1)
def generateVCFLine(chrom, pos, db, ref, alt, filt, qual, INFO):
    """Build one tab-separated VCF record with a single heterozygous sample.

    Parameters are all strings except INFO, which is a sequence of
    [key, value] pairs (a one-element entry is emitted as a bare flag).

    The INFO entries are joined with ";".  Earlier versions concatenated the
    entries with no separator, so callers prefixed keys with ";" by hand; a
    leading ";" on an entry is therefore stripped, which keeps the output for
    those callers byte-identical while making clean keys work too.  An empty
    INFO list yields "." (the VCF missing-value marker) rather than an empty
    column.

    Returns the record without a trailing newline.  The genotype column is
    always "0/1:<qual>" under the format "GT:GQ".
    """
    info_entries = ["=".join(keval).lstrip(";") for keval in INFO]
    info_string = ";".join(info_entries) if info_entries else "."

    sample_format = "GT:GQ"
    genotype = "0/1:" + qual
    all_fields = [chrom, pos, db, ref, alt, qual, filt,
                  info_string, sample_format, genotype]
    return "\t".join(all_fields)
def COUNT_ALLELES(vcf_fields):
    """Count the sample columns whose genotype mentions allele "1".

    vcf_fields is one VCF record already split on tabs.  The first nine
    entries are the fixed columns (CHROM .. FORMAT) and are skipped; each
    remaining entry is a sample column whose genotype is the text before the
    first ":".

    Note that, despite the name, every sample contributes at most one to the
    result: a "1/1" genotype counts once, exactly like "0/1".
    """
    return sum(
        1
        for sample in vcf_fields[9:]
        if "1" in sample.split(":")[0]
    )
def checkOffset(self, newHeader=None):
    """Verify that self.offset points at this validator's column.

    With no argument the header stored on the instance (self.header) is
    checked; with newHeader, that header is checked instead.

    The class previously defined checkOffset twice (once without and once
    with a header argument).  The second definition replaced the first, so
    the zero-argument form could never be called.  Both forms are merged
    here and keep their original error messages.

    Raises ValueError when the entry at self.offset is not self.name.
    """
    if newHeader is None:
        if self.header[self.offset] != self.name:
            raise ValueError("Header[offset] is not name")
    elif newHeader[self.offset] != self.name:
        raise ValueError("The internal offset is not appropriate for the input header.")
class VCFSingletonValidator:
    """Tallies validation outcomes for VCF records that carry a set of INFO keys.

    A record is used when every string in `infokeys` occurs in the line and
    its FILTER column is "."; filtered records are only counted.  Each used
    record must provide the INFO entries "nonrefAlleles" (an integer) and
    "snpID" (the validated base is read from the text after "_g").

    NOTE(review): the indentation of this file was lost in the source that
    was reviewed.  The nesting below was reconstructed so that the
    concordance check runs for every record that is not a false positive,
    which is what the discordant count in __str__ assumes -- confirm against
    the original file.
    """

    def __init__(self, infokeys, name):
        self.name = name
        self.infokeys = infokeys
        self.numCalls = 0
        self.concordantTPCalls = 0
        self.numFalsePositives = 0
        self.numSingletonsActuallyDoublePlus = 0
        self.filteredCalls = 0
        self.falsePositives = list()
        self.wrongAF = list()
        self.allSites = list()
        # Only consulted by update(..., "CHECK_NEW_CALL").  They used to be
        # created by a subclass alone, so that mode failed with
        # AttributeError on this class; subclasses may still overwrite them.
        self.calledCounts = dict()
        self.originalSingletonsNowCalledHigher = 0

    def _newCallAlleles(self, site):
        # Allele count recorded for `site` in calledCounts; 0 when absent.
        count = self.calledCounts.get(site)
        if count is None:
            return 0
        return int(count)

    def update(self, vcfLine, args="NONE"):
        """Fold one VCF line into the tallies.

        With args == "CHECK_NEW_CALL", a site validated with more than one
        non-reference allele is counted in originalSingletonsNowCalledHigher
        when calledCounts also records more than one allele for it; otherwise
        it is counted as a wrong-allele-frequency site.
        """
        if not self.useLine(vcfLine):
            return

        self.numCalls = 1 + self.numCalls
        fields = vcfLine.strip().split("\t")
        site = fields[0] + ":" + fields[1]
        self.allSites.append(site)

        infodict = dict()
        for pair in fields[7].split(";"):
            # partition() tolerates flag entries that have no "=value" part;
            # indexing the result of split("=") raised IndexError on those.
            key, _, value = pair.partition("=")
            infodict[key] = value

        alt = fields[4]
        numAlleles = int(infodict["nonrefAlleles"])
        call = infodict["snpID"].split("_g")[1].split("_")[0].upper()

        if numAlleles == 0:
            self.numFalsePositives = 1 + self.numFalsePositives
            self.falsePositives.append(site)
            return

        if numAlleles > 1:
            # The comparison used to be int(count > 1): the parenthesis was
            # misplaced, so the count itself was never converted to a number.
            if args == "CHECK_NEW_CALL" and self._newCallAlleles(site) > 1:
                self.originalSingletonsNowCalledHigher = 1 + self.originalSingletonsNowCalledHigher
            else:
                self.numSingletonsActuallyDoublePlus = 1 + self.numSingletonsActuallyDoublePlus
                self.wrongAF.append(site)

        if call.find(alt) != -1:
            self.concordantTPCalls = 1 + self.concordantTPCalls
        elif alt.find(",") == -1:
            print("Discordant site at " + site + " with call " + call + " and alt " + alt)
        else:
            print("Tri allelic site at " + site)

    def useLine(self, vcfLine):
        """Return True when the line has every info key and is unfiltered."""
        for key in self.infokeys:
            if vcfLine.find(key) == -1:
                return False

        if vcfLine.split("\t")[6] != ".":
            self.filteredCalls = 1 + self.filteredCalls
            return False

        return True

    def __str__(self):
        # Columns: name, filtered, called, false positives, 2+ alleles,
        # discordant (calls that are neither concordant nor false positive),
        # true positives.
        entries = [self.name, str(self.filteredCalls), str(self.numCalls), str(self.numFalsePositives),
                   str(self.numSingletonsActuallyDoublePlus),
                   str(self.numCalls - self.concordantTPCalls - self.numFalsePositives),
                   str(self.concordantTPCalls)]
        return "\t".join(entries)

    def falsePositiveSites(self):
        return "\n".join(self.falsePositives)

    def wrongAFSites(self):
        return "\n".join(self.wrongAF)

    def printAllSites(self):
        return "\n".join(self.allSites)
class Status:
    """Existence and size of one file, with a short human-readable status.

    status is one of:
      "missing"            -- exists is false
      "no-data"            -- the file exists but its size is 0
      "exists: bytes=<n>"  -- the file exists and has n bytes
    """

    def __init__(self, file, exists, size):
        self.file = file
        self.exists = exists
        self._size = size

        # These three cases are mutually exclusive.  The first test used to
        # be a separate `if`, so "missing" was always overwritten by one of
        # the two branches that followed it.
        if not exists:
            self.status = "missing"
        elif size == 0:
            self.status = "no-data"
        else:
            self.status = "exists: bytes=" + str(self.size())

    def __str__(self):
        return self.file + " " + self.status

    def size(self):
        """Return the size in bytes that was passed to the constructor."""
        return self._size

    def viewSize(self):
        """Return the size formatted by MergeBAMsUtils.greek."""
        return MergeBAMsUtils.greek(self.size())
def validateFile(relPath, localRoot, ftpRoot):
    """Compare the local and FTP copies of one file.

    relPath   -- path of the file relative to both roots
    localRoot -- root directory of the local copy
    ftpRoot   -- root directory on the FTP server

    Returns the object produced by compareFileStatus for the two copies and,
    unless OPTIONS.quiet is set, prints a one-line STATUS message.
    """
    # os.path.join discards every earlier component when a later one is
    # absolute, so a leading "/" on relPath would silently drop both roots.
    relative = relPath.lstrip("/")

    # The local path used to be built from the module-level `root`, ignoring
    # the localRoot argument; that only worked when run as a script.
    localPath = os.path.join(localRoot, relative)
    ftpPath = os.path.join(ftpRoot, relative)

    # Single-argument print calls behave the same as the former print
    # statements and are also valid under Python 3.
    if DEBUG:
        print('Checking %s' % relPath)
    localStat = localStatus(localPath)
    ftpStat = ftpStatus(ftpPath)
    if DEBUG:
        print(' local status is %s' % localStat)
        print(' ftp status is  %s' % ftpStat)
    compared = compareFileStatus(localStat, ftpStat)

    if not OPTIONS.quiet:
        print('STATUS %20s for %s ' % (compared.status, relPath))
    return compared
def compareAlignmentIndices(remoteAlignmentIndex, alignmentIndex):
    """Fetch the remote alignment index and compare its md5 with the local one.

    remoteAlignmentIndex -- path of the index relative to the FTP root, or None
    alignmentIndex       -- path of the local index file, or None

    Returns the name of the fetched local copy of the remote index, or None
    when either argument is None (nothing is fetched in that case).  A digest
    mismatch is reported but does not stop the run.
    """
    if remoteAlignmentIndex is None or alignmentIndex is None:
        return None

    printHeaderSep()
    print('Comparing remote and local alignment indices: ')
    remotePath = os.path.join(ftpParsed[2], remoteAlignmentIndex)
    remoteAlignmentIndexFile = fetchFtpFile(remotePath)
    print(' Fetched %s to %s' % (remotePath, remoteAlignmentIndexFile.file))

    raImd5 = md5(remoteAlignmentIndexFile.file)
    laImd5 = md5(alignmentIndex)
    # The two digests used to be passed in the opposite order, so the remote
    # digest was printed under "local=" and the local one under "remote=".
    print(' md5s: local=%s remote=%s' % (laImd5, raImd5))
    if raImd5 != laImd5:
        print(' [FAIL] -- alignment indices do not have the same hash!')
        #sys.exit(1)
    else:
        print(' [PASS] -- alignment indices are the same')
    return remoteAlignmentIndexFile.file
- type='int', default=None, - help="If provided, maximum number of files in the local archive to examine") - parser.add_option("-M", "--maxFiles", dest="maxFiles", - type='int', default=None, - help="If provided, maximum number of files in the local archive to examine") - parser.add_option("-q", "--quiet", dest="quiet", - action='store_true', default=False, - help="If provided, prints out the individual status of all files") - parser.add_option("-i", "--remoteAlignmentIndex", dest="remoteAlignmentIndex", - type='string', default=None, - help="relative path to the FTP's alignment.index file for comparison") - parser.add_option("-c", "--remoteChangeLog", dest="remoteChangeLog", - type='string', default=None, - help="relative path to the FTP's CHANGELOG file for display") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 2: - parser.error("incorrect number of arguments") - root, ftpRoot = args - - ftpParsed = urlparse(ftpRoot) - FTPSERVER = FTP(ftpParsed[1]) - FTPSERVER.login() - - displayChangeLog(OPTIONS.remoteChangeLog) - remoteAI = compareAlignmentIndices(OPTIONS.remoteAlignmentIndex, OPTIONS.alignmentIndex) - - results = dict() - AIToUse = OPTIONS.alignmentIndex - if remoteAI != None: - AIToUse = remoteAI - print 'Using remote AI', remoteAI, "for comparisons" - for file in itertools.chain(readAlignmentIndex(AIToUse), filesInLocalPath(root, OPTIONS.scanLocal )): - #print line - #bas = line.split()[6] - if file not in results: - compared = validateFile( file, root, ftpParsed[2] ) - results[file] = compared - #localIndex - if OPTIONS.maxFiles != None and len(results) > OPTIONS.maxFiles: - break - - printHeaderSep() - print 'SUMMARY: Total files examined', len(results) - for status in ['in-sync', 'size-mismatch', 'unknown-local-file', 'local-file-missing', 'orphaned-file']: - printHeaderSep() - filesOfStatus = sortByName(filter(lambda x: x.status == status, results.itervalues())) - n = len(filesOfStatus) - print 'SUMMARY: %s' % ( status ) - print 'SUMMARY: 
Files %d (%.2f%% of total)' % ( n, n * 100.0 / len(results)) - - statusForFileListing = ['size-mismatch', 'local-file-missing', 'orphaned-file'] - maxFilesToList = 20 - if status in statusForFileListing: - print 'SUMMARY: listing the first', min(maxFilesToList, n), 'of', n - for file in itertools.islice(filesOfStatus, maxFilesToList): - if status == 'size-mismatch': - print 'SUMMARY: File: ftp=%d bytes local=%d bytes %12s %s' % ( file.ftpStat.size(), file.localStat.size() , modTimeStr(file.modTime()), file.file) - else: - print 'SUMMARY: File: %8s %12s %s' % ( MergeBAMsUtils.greek(file.size()), modTimeStr(file.modTime()), file.file) - if n > 0: - fileSizes = MergeBAMsUtils.greek(reduce(operator.__add__, map( ComparedFiles.size, filesOfStatus ), 0 )) - mostRecentMod = modTimeStr(apply(max, map( ComparedFiles.modTime, filesOfStatus ) + [0])) - - print 'SUMMARY: total size %s' % ( fileSizes ) - print 'SUMMARY: last modification time %s' % ( mostRecentMod ) - - totalSize = MergeBAMsUtils.greek(reduce(operator.__add__, map( ComparedFiles.size, results.itervalues() ))) - print '#### TOTAL PROJECT SIZE: %s' % ( totalSize ) diff --git a/python/analyzeRunReports.py b/python/analyzeRunReports.py deleted file mode 100755 index fff4659e0..000000000 --- a/python/analyzeRunReports.py +++ /dev/null @@ -1,515 +0,0 @@ -import os.path -import sys -from optparse import OptionParser -from itertools import * -from xml.etree.cElementTree import * -import gzip -import datetime -import re - -MISSING_VALUE = "NA" -RUN_REPORT_LIST = "GATK-run-reports" -RUN_REPORT = "GATK-run-report" - -def main(): - global OPTIONS - usage = "usage: %prog [options] mode file1 ... 
fileN" - parser = OptionParser(usage=usage) - - parser.add_option("-v", "--verbose", dest="verbose", - action='store_true', default=False, - help="If provided, verbose progress will be enabled") - - parser.add_option("", "--overwrite", dest="overwrite", - action='store_true', default=False, - help="If provided, archive mode will overwrite destination file, if it exists (DANGEROUS)") - - parser.add_option("-o", "--o", dest="output", - type='string', default=None, - help="if provided, output will go here instead of stdout") - - parser.add_option("", "--no-dev", dest="noDev", - action='store_true', default=False, - help="if provided, only records not coming from a dev version of GATK will be included") - - parser.add_option("", "--rev", dest="rev", - type="string", default=None, - help="if provided, only reports generated by this version of the GATK will be included") - - parser.add_option("-E", "", dest="exception_selection", - type='choice', choices=['all', 'user', 'sting'], default='all', - help="if provided, will only emit records matching of the provided class [default %default]") - - parser.add_option("", "--max_days", dest="maxDays", - type='int', default=None, - help="if provided, only records generated within X days of today will be included") - - parser.add_option("-D", "--delete_while_archiving", dest="reallyDeleteInArchiveMode", - action='store_true', default=False, - help="if provided, we'll actually delete records when running in archive mode") - - (OPTIONS, args) = parser.parse_args() - if len(args) == 0: - parser.error("Requires at least GATKRunReport xml to analyze") - - stage = args[0] - files = resolveFiles(args[1:]) - - # open up the output file - if OPTIONS.output != None: - if stage == "archive" and os.path.exists(OPTIONS.output) and not OPTIONS.overwrite: - raise "archive output file already exists, aborting!", OPTIONS.output - out = openFile(OPTIONS.output,'w') - else: - out = sys.stdout - - handler = getHandler(stage)(stage, out) - 
handler.initialize(files) - - # parse all of the incoming files - counter = 0 - for report in readReports(files): - # todo -- add matching here - handler.processRecord(report) - counter += 1 - report.clear() - - handler.finalize(files) - if OPTIONS.output != None: out.close() - print 'Processed records:', counter - -# -# Stage HANDLERS -# -class StageHandler: - def __init__(self, name, out): - self.name = name - self.out = out - - def getName(self): return self.name - - def initialize(self, args): - pass # print 'initialize' - - def processRecord(self, record): - pass # print 'processing record', record - - def finalize(self, args): - pass # print 'Finalize' - - -# a map from stage strings -> function to handle record -HANDLERS = dict() -def addHandler(name, handler): - HANDLERS[name] = handler - -def getHandler(stage): - return HANDLERS[stage] - -def eltIsException(elt): - return elt.tag == "exception" - -def parseException(elt): - msgElt = elt.find("message") - msgText = "MISSING" - userException = "NA" - if msgElt != None: msgText = msgElt.text - stackTrace = elt.find("stacktrace").find("string").text - if elt.find("is-user-exception") != None: - #print elt.find("is-user-exception") - userException = elt.find("is-user-exception").text - return msgText, stackTrace, userException - -def javaExceptionFile(javaException): - m = re.search("\((.*\.java:.*)\)", javaException) - if m != None: - return m.group(1) - else: - javaException - -class RecordDecoder: - def __init__(self): - self.fields = list() - self.formatters = dict() - - def id(elt): return elt.text - def toString(elt): return '%s' % elt.text - - def formatExceptionMsg(elt): - return '%s' % parseException(elt)[0] - - def formatExceptionAt(elt): - return '%s' % parseException(elt)[1] - - def formatExceptionAtBrief(elt): - return '%s' % javaExceptionFile(parseException(elt)[1]) - - def formatExceptionUser(elt): - return '%s' % parseException(elt)[2] - - def formatDomainName(elt): - if elt != None: - parts = 
elt.text.split(".") - if len(parts) >= 2: - return '.'.join(parts[-2:]) - else: - return 'unknown' - - def add(names, func): - for name in names: - addComplex(name, [name], [func]) - - def addComplex(key, fields, funcs): - self.fields.extend(fields) - self.formatters[key] = zip(fields, funcs) - - add(["id", "walker-name", "svn-version", "phone-home-type"], id) - add(["start-time", "end-time"], toString) - add(["run-time", "java-tmp-directory", "working-directory", "user-name"], id) - addComplex("host-name", ["host-name", "domain-name"], [id, formatDomainName]) - add(["java", "machine"], toString) - add(["max-memory", "total-memory", "iterations", "reads"], id) - addComplex("exception", ["exception-msg", "exception-at", "exception-at-brief", "is-user-exception"], [formatExceptionMsg, formatExceptionAt, formatExceptionAtBrief, formatExceptionUser]) - # add(["command-line"], toString) - - def decode(self, report): - bindings = dict() - for elt in report: - if elt.tag in self.formatters: - fieldFormats = self.formatters[elt.tag] - # we actually care about this tag - for field, formatter in fieldFormats: - bindings[field] = formatter(elt) - - # add missing data - for field in self.fields: - if field not in bindings: - bindings[field] = MISSING_VALUE - - return bindings - -# def -class RecordAsTable(StageHandler): - def __init__(self, name, out): - StageHandler.__init__(self, name, out) - - def initialize(self, args): - self.decoder = RecordDecoder() - print >> self.out, "\t".join(self.decoder.fields) - - def processRecord(self, record): - parsed = self.decoder.decode(record) - - def oneField(field): - val = MISSING_VALUE - if field in parsed: - val = parsed[field] - if val == None: - if OPTIONS.verbose: print >> sys.stderr, 'field', field, 'is missing in', parsed['id'] - else: - val = val.replace('"',"'") -# if val.find("\t") != -1: -# if OPTIONS.verbose: print >> sys.stderr, 'Warning -- val', val, 'contains tabs, droping field', field -# raise Error - #val = "value 
contained tabs, dropped" - if val.find(" ") != -1: - val = "\"" + val + "\"" - return val - try: - print >> self.out, "\t".join([ oneField(field) for field in self.decoder.fields ]) - except: - #print 'Failed to convert to table ', parsed - pass - -addHandler('table', RecordAsTable) - -class CountRecords(StageHandler): - def __init__(self, name, out): - StageHandler.__init__(self, name, out) - - def initialize(self, args): - self.counter = 0 - - def processRecord(self, record): - self.counter += 1 - -addHandler('count', CountRecords) - - -class RecordAsXML(StageHandler): - def __init__(self, name, out): - StageHandler.__init__(self, name, out) - - def initialize(self, args): - print >> self.out, "<%s>" % RUN_REPORT_LIST - - def processRecord(self, record): - print >> self.out, tostring(record) - - def finalize(self, args): - print >> self.out, "" % RUN_REPORT_LIST - -addHandler('xml', RecordAsXML) - -class Archive(RecordAsXML): - def __init__(self, name, out): - RecordAsXML.__init__(self, name, out) - - def finalize(self, args): - RecordAsXML.finalize(self, args) - for arg in args: - if OPTIONS.verbose: print 'Deleting file: ', arg - if OPTIONS.reallyDeleteInArchiveMode: - os.remove(arg) - print 'Deleted', len(args), 'files' - -addHandler('archive', Archive) - -class ExceptionReport(StageHandler): - #FIELDS = ["Msg", "At", "SVN.versions", "Walkers", 'Occurrences', 'IDs'] - def __init__(self, name, out): - StageHandler.__init__(self, name, out) - self.exceptions = [] - - def initialize(self, args): - self.decoder = RecordDecoder() - #print >> self.out, "\t".join(self.FIELDS) - - def processRecord(self, record): - for elt in record: - if eltIsException(elt): - self.exceptions.append(self.decoder.decode(record)) - break - - def finalize(self, args): - commonExceptions = list() - - def addToCommons(ex): - for common in commonExceptions: - if common.equals(ex): - common.update(ex) - return - commonExceptions.append(CommonException(ex)) - - for ex in self.exceptions: - 
addToCommons(ex) - commonExceptions = sorted(commonExceptions, None, lambda x: x.counts) - - for common in commonExceptions: - msg, at, svns, walkers, counts, ids, duration, users, userError = common.toStrings() - - if not matchesExceptionSelection(userError): - continue - - print >> self.out, ''.join(['*'] * 80) - print >> self.out, 'Exception :', msg - print >> self.out, ' is-user-exception? :', userError - print >> self.out, ' at :', at - print >> self.out, ' walkers :', walkers - print >> self.out, ' svns :', svns - print >> self.out, ' duration :', duration - print >> self.out, ' occurrences :', counts - print >> self.out, ' users :', users - print >> self.out, ' ids :', ids - - -def matchesExceptionSelection(userError): - if OPTIONS.exception_selection == "all": - return True - elif OPTIONS.exception_selection == "user" and userError == "true": - return True - elif OPTIONS.exception_selection == "sting" and userError == "false": - return True - return False - -class CommonException: - MAX_SET_ITEMS_TO_SHOW = 5 - - def __init__(self, ex): - self.msgs = set([ex['exception-msg']]) - self.at = ex['exception-at'] - self.svns = set([ex['svn-version']]) - self.users = set([ex['user-name']]) - self.userError = ex['is-user-exception'] - self.counts = 1 - self.times = set([decodeTime(ex['end-time'])]) - self.walkers = set([ex['walker-name']]) - self.ids = set([ex['id']]) - - def equals(self, ex): - return self.at == ex['exception-at'] - - def update(self, ex): - self.msgs.add(ex['exception-msg']) - self.svns.add(ex['svn-version']) - self.users.add(ex['user-name']) - self.counts += 1 - self.walkers.add(ex['walker-name']) - self.times.add(decodeTime(ex['end-time'])) - self.ids.add(ex['id']) - - def bestExample(self, examples): - def takeShorter(x, y): - if len(y) < len(x): - return y - else: - return x - return reduce(takeShorter, examples) - - def setString(self, s): - if len(s) > self.MAX_SET_ITEMS_TO_SHOW: - s = [x for x in s][0:self.MAX_SET_ITEMS_TO_SHOW] + ["..."] - 
return ','.join(s) - - def duration(self): - x = sorted(filter(lambda x: x != "ND", self.times)) - if len(x) >= 2: - return "-".join(map(lambda x: x.strftime("%m/%d/%y"), [x[0], x[-1]])) - elif len(x) == 1: - return x[0] - else: - return "ND" - - - def toStrings(self): - return [self.bestExample(self.msgs), self.at, self.setString(self.svns), self.setString(self.walkers), self.counts, self.setString(self.ids), self.duration(), self.setString(self.users), self.userError] - -addHandler('exceptions', ExceptionReport) - - - -class SummaryReport(StageHandler): - #FIELDS = ["Msg", "At", "SVN.versions", "Walkers", 'Occurrences', 'IDs'] - def __init__(self, name, out): - StageHandler.__init__(self, name, out) - self.reports = [] - - def initialize(self, args): - self.decoder = RecordDecoder() - #print >> self.out, "\t".join(self.FIELDS) - - def processRecord(self, record): - self.reports.append(self.decoder.decode(record)) - - def finalize(self, args): - print >> self.out, 'GATK run summary for :', datetime.datetime.today() - print >> self.out, ' number of runs :', len(self.reports) - print >> self.out, ' number of StingExceptions :', len(filter(isStingException, self.reports)) - print >> self.out, ' number of UserExceptions :', len(filter(isUserException, self.reports)) - print >> self.out, ' users :', ', '.join(set(map(userID, self.reports))) - -def userID(rec): - return rec['user-name'] - -def isStingException(rec): - return rec['exception-at'] != "NA" and rec['is-user-exception'] == "false" - -def isUserException(rec): - return rec['exception-at'] != "NA" and rec['is-user-exception'] == "true" - -addHandler('summary', SummaryReport) - -# -# utilities -# -def openFile(filename, mode='r'): - if ( filename.endswith(".gz") ): - return gzip.open(filename, mode) - else: - return open(filename, mode) - -def resolveFiles(paths): - allFiles = list() - def resolve1(path): - if not os.path.exists(path): - raise Exception("Path doesn't exist: " + path) - elif os.path.isfile(path): 
- allFiles.append(path) - else: - def one(arg, dirname, files): - #print dirname, files - #print dirname - allFiles.extend(map( lambda x: os.path.join(path, x), files )) - #print files - - os.path.walk(path, one, None) - - map( resolve1, paths ) - return allFiles - -def decodeTime(time): - if time == "ND": - return "ND" - else: - return datetime.datetime.strptime(time.split()[0], "%Y/%m/%d") - #return datetime.datetime.strptime(time, "%Y/%m/%d %H.%M.%S") - -def eltTagEquals(elt, tag, value): - if elt == None: - return False - msgElt = elt.find(tag) - found = msgElt != None and msgElt.text == value - #print 'finding', tag, 'in', elt, msgElt, msgElt.text, found - return found - -def passesFilters(elt): - if OPTIONS.noDev and eltTagEquals(elt.find('argument-collection'),'phone-home-type','DEV'): - #print 'skipping', elt - return False - if OPTIONS.rev != None and not eltTagEquals(elt, 'svn-version', OPTIONS.rev): - return False - if OPTIONS.maxDays != None: - now = datetime.datetime.today() - now = datetime.datetime(now.year, now.month, now.day) - # 2010/08/31 15.38.00 - eltTime = decodeTime(elt.find('end-time').text) - diff = now - eltTime - #print eltTime, now, diff, diff.days - if diff.days > OPTIONS.maxDays: - return False - - return True - -def readReportsSlow(files): - #print files - for file in files: - if OPTIONS.verbose: print 'Reading file', file - input = openFile(file) - try: - tree = ElementTree(file=input) - except: - print "Ignoring excepting file", file - continue - - elem = tree.getroot() - if elem.tag == RUN_REPORT_LIST: - counter = 0 - for sub in elem: - if passesFilters(sub): - counter += 1 - if counter % 1000 == 0: print 'Returning', counter - yield sub - else: - if passesFilters(elem): - yield elem - -def readReports(files): - #print files - for file in files: - if OPTIONS.verbose: print 'Reading file', file - input = openFile(file) - try: - counter = 0 - for event, elem in iterparse(input): - if elem.tag == RUN_REPORT: - if passesFilters(elem): 
- counter += 1 - #if counter % 1000 == 0: print 'Returning', counter - yield elem - except: - print "Ignoring excepting file", file - continue - - -if __name__ == "__main__": - main() diff --git a/python/change_paths.py b/python/change_paths.py deleted file mode 100644 index 0cff60459..000000000 --- a/python/change_paths.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python - -import fnmatch,os,string,sys - -def add_directory(dirname): - parent = os.path.dirname(dirname) - if not os.path.exists(parent) and parent != '': - add_directory(parent) - if not os.path.exists(dirname): - print 'creating directory',dirname - os.makedirs(dirname) - os.system('svn add %s'%dirname) - -def remove_directory(dirname): - if os.path.exists(dirname): - print 'removing directory',dirname - os.system('svn rm %s'%dirname) -# os.system('rm -rf %s'%dirname) - -def move_file(source_filename,target_filename): - print 'moving %s to %s' % (source_filename,target_filename) - os.system('svn mv %s %s'%(source_filename,target_filename)) -# os.system('mv %s %s'%(source_filename,target_filename)) - -target_public = 'public' -target_private = 'private' - -base_excludes = ['.svn','archive','tribble','integrationtests','settings',target_public,target_private] -private_paths = ['playground','oneoffprojects','oneoffs','archive','analysis','alignment','bwa','c','lua','matlab','perl','python','ruby','R','shell'] -paths_to_trim = ['playground','oneoffprojects','oneoffs'] -source_extensions = ['*.java','*.scala'] - -def intersect(a,b): - return list(set(a) & set(b)) - -def is_source_file(file): - for source_extension in source_extensions: - if fnmatch.fnmatch(file,source_extension): - return True - return False - -def modify_path(path): - tokens = path.split('/') - # compute proper target path: public or private? - if(intersect(tokens,private_paths)): - # kill private directory indicator only if private directory indicator is not the first element in the path. 
- modified_tokens = [token for token in tokens if token not in paths_to_trim] - return string.join(modified_tokens,'/').replace('./',target_private+'/') - else: - return path.replace('./',target_public+'/') - -add_directory(target_public) -add_directory(target_private) - -# just move archive wholesale; don't worry about processing at this point. -move_file('archive',target_private) - -for root,dirs,files in os.walk('.'): - # filter out non-processed files from root directory and base_excludes - tokens = string.split(root,'/') - if len(tokens) == 1: - continue - if len(intersect(tokens,base_excludes)) > 0: - continue - - # handle file move - for file in ["%s/%s"%(root,file) for file in files]: - modified_path = modify_path(file) - dirname = os.path.dirname(modified_path) - add_directory(dirname) - move_file(file,dirname) - -# handle source code modification -for root,dirs,files in os.walk('.'): - # process only public and private directories - if not (root.startswith('./'+target_public) or root.startswith('./'+target_private)): - continue - for file in ["%s/%s"%(root,file) for file in files if is_source_file(file)]: - f = open(file,'r') - lines = f.readlines() - for i in range(len(lines)): - line = lines[i] - if line.startswith('package') or line.startswith('import'): - tokens = line.split('.') - if intersect(tokens,private_paths): - tokens = [token for token in tokens if token not in paths_to_trim] - modified_line = string.join(tokens,'.') - print "%s: '%s' => '%s'" % (file,line.rstrip(),modified_line.rstrip()) - lines[i] = modified_line - f.close() - f = open(file,'w') - f.writelines(lines) - f.close() - -for file in os.listdir('.'): - if os.path.isdir(file) and not file in base_excludes and not file.startswith(target_public) and not file.startswith(target_private): - remove_directory(file) diff --git a/python/collectCalls.py b/python/collectCalls.py deleted file mode 100755 index b58c47328..000000000 --- a/python/collectCalls.py +++ /dev/null @@ -1,533 +0,0 @@ 
-#!/usr/bin/env python -# imports -import subprocess -import os -import sys -import re -import time -import math -import codecs -## script keys and triggers -remove_all_files = False # be very careful with this one -make_directories = True -copy_vcf_files = True -annotate_vcf_files = True -run_snp_selector = False -make_hard_threshold_vcf = False -snp_select_threshold_vcf = False -generate_hapmap_info = False -generate_threshold_hapmap_info = False -false_negatives_by_snp_selector = False -false_negatives_by_snp_selector_on_threshold = False -plot_info_field_metrics = False -plot_info_field_metrics_threshold = False -make_best_effort_vcf = False -snp_select_best_effort_vcf = False -plot_best_effort_metrics = False - - -## global stuff -DEBUG = True -override_pilot_bamfile = True -override_production_bamfile = True -matlab_dir = "/humgen/gsa-scr1/projects/FHS/scripts" -pipeline_samples_info_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/samples.tsv" -samples_pipeline_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/" -home_dir = "/humgen/gsa-scr1/projects/FHS/" -annotations_data_base = home_dir + "oneOffAnalyses/annotationEffectiveness/data/" -figures_base = home_dir + "oneOffAnalyses/annotationEffectiveness/figures/" -production_calls_dir = home_dir + "production/raw_production_calls_12_07/" -pilot_calls_dir = home_dir + "pilot/calls/raw_pilot_calls_12_07/" -hapmap_pool_info_dir = home_dir+"pilot/project_info/pilot_pool_information/" -pilot_standard_project_name = "framinghampilot" -pilot_clipped_project_name = "pilot_clipped" -pilot_no_gatk_project_name = "pilot_no_gatk" -FHS_project_name = "FHS" -FHS_round_2_project_name = "FHS_round_2" -all_projects = [pilot_standard_project_name,pilot_clipped_project_name,pilot_no_gatk_project_name,FHS_project_name,FHS_round_2_project_name] -pilot_projects = [pilot_standard_project_name,pilot_clipped_project_name,pilot_no_gatk_project_name] -production_projects = 
[FHS_project_name,FHS_round_2_project_name] - -gatk_cmd_base = "java -jar /humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -L /humgen/gsa-scr1/projects/FHS/interval_lists/FHS_exons_only.interval_list" -dbsnp_129_path = "/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod" - -vcf_header = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] - -def subSystem(cmd): - if ( DEBUG ): - print(cmd) - else: - os.system(cmd) - -def safeRemove(cmd): - if ( not ( cmd.startswith("rm -rf "+annotations_data_base) or cmd.startswith("rm -rf "+production_calls_dir) or cmd.startswith("rm -rf "+pilot_calls_dir) or cmd.startswith("rm -rf "+figures_base) ) ): - print("Unsafe removal attempt: "+cmd) - else: - subSystem(cmd) - -def productionBamfilePicard(proj, samp): - for line in open("/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/framingham.flowcell_lanes.tsv").readlines(): - spline = line.strip().split() - if ( spline[1] == proj and spline[2] == samp ): - return spline[5]+spline[3]+"."+spline[4]+".aligned.bam" - print("no return for: "+proj+"_"+samp) - -def annotationProjectPath(proj): - return annotations_data_base+proj+"/" - -def annotationPath(proj,pool): - return annotationProjectPath(proj)+pool+"/" - -def annotationProjectFile(proj, pool, appender): - return annotationPath(proj,pool)+pool+appender - -def figureProjectPath(proj): - return figures_base+proj+"/" - -def figurePath(proj, samp): - return figureProjectPath(proj)+samp+"/" - -def pipelineProjectPath(proj): - return samples_pipeline_path+proj+"/" - -def pipelinePath(proj, samp): - return pipelineProjectPath(proj)+samp+"/" - -def pipelineBam(proj, samp): - if ( proj == "framinghampilot" and override_pilot_bamfile ): - if ( samp == "CEPH1" ): - return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/1/Solexa-10930/302JDAAXX.1.aligned.bam -I 
/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/5/Solexa-10933/302JDAAXX.5.aligned.bam" - if ( samp == "CEPH2" ): - return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/2/Solexa-10931/302JDAAXX.2.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/8/Solexa-10934/302JDAAXX.8.aligned.bam" - if ( samp == "CEPH3" ): - return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/3/Solexa-10932/302JDAAXX.3.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/6/Solexa-10936/302JDAAXX.6.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/7/Solexa-10935/302JDAAXX.7.aligned.bam" - - if ( proj in production_projects and override_production_bamfile ): - return productionBamfilePicard(proj,samp) - - return pipelinePath(proj,samp)+proj+"."+samp+".bam" - -def pipelineVCF(proj,samp): - return pipelinePath(proj,samp)+samp+".vcf" - -def homePath(proj, samp): - if ( proj in pilot_projects ): - return pilot_calls_dir+proj+"/" - else: - return production_calls_dir+proj+"/" - -def homeProjectFile(proj, samp, appender): - return homePath(proj,samp)+samp+appender - -def homeVCF(proj, samp, extension): - return homeProjectFile(proj,samp,extension)+".vcf" - -def hapmapVCF(proj, samp): - return "/humgen/gsa-scr1/projects/FHS/pilot/analysis/"+samp.lower()+"_hapmap_snp_only.vcf" - -def poolSize(proj): - if ( proj in pilot_projects ): - return 40 - else: - return 28 - -def poolBinding(proj, samp): - if ( proj in production_projects ): - raise Exception, "Production projects do not have hapmap pool bindings" - else: - return hapmap_pool_info_dir+samp+".pool.path" - -def poolSampleNames(proj,samp): - if ( proj in production_projects ): - raise Exception, "Production projects do not have hapmap pool bindings" - else: - return hapmap_pool_info_dir+samp+".pool" - - -## working stuff - -#working_projects = [pilot_standard_project_name] -working_projects = [FHS_round_2_project_name] -working_info_fields = ["syzy_SB","syzy_DP","syzy_NMMR","HRun"] 
-working_quality_scores = ["0","1","2","3","4","5"] -working_samples = set() -projects_to_samples = [] - -# create the project --> list of samples dictionary - -for project in working_projects: - sample_list = os.listdir(pipelineProjectPath(project)) - projects_to_samples.append([project, sample_list]) - for s in sample_list: - working_samples.add(s) - -projects_to_samples = dict(projects_to_samples) - -# create the sample --> pool name dictionary - -samples_to_pool_name = [] - -for line in open(pipeline_samples_info_path).readlines(): - spline = line.strip().split() - if(spline[0] in working_samples): - keyval = [spline[0], spline[1]] - samples_to_pool_name.append(keyval) - -samples_to_pool_name = dict(samples_to_pool_name) - -# remove everything -if ( remove_all_files ): - for project in working_projects: - safeRemove("rm -rf "+homePath(project,"") ) - safeRemove("rm -rf "+annotationProjectPath(project)) - safeRemove("rm -rf "+figureProjectPath(project)) - -# make directories if necessary -if ( make_directories ): - for project in working_projects: - subSystem("mkdir "+homePath(project,"")) - subSystem("mkdir "+annotationProjectPath(project)) - subSystem("mkdir "+figureProjectPath(project)) - for sample in projects_to_samples[project]: - subSystem("mkdir "+annotationPath(project, samples_to_pool_name[sample])) - subSystem("mkdir "+figurePath(project,samples_to_pool_name[sample])) - -# copy the vcf files from the pipeline into the /FHS/ directory -if ( copy_vcf_files ): - for project in working_projects: - sample_list = projects_to_samples[project] - for sample in sample_list: - pool = samples_to_pool_name[sample] - sample_home_vcf = homeVCF(project,samples_to_pool_name[sample],"_raw") - pipeline_vcf = pipelineVCF(project,sample) - subSystem("cat "+pipeline_vcf+" | sed 's/"+sample+"/"+pool+"/g' > "+sample_home_vcf) - -# annotate the vcf files using VariantAnnotator -if ( annotate_vcf_files ): - prior_step_extension = "_raw" - this_step_extension = "_annotated" - 
gatk_annotate_base = gatk_cmd_base + " -T VariantAnnotator -exp -D "+dbsnp_129_path - for project in working_projects: - sample_list = projects_to_samples[project] - for sample in sample_list: - pool = samples_to_pool_name[sample] - inputVCF = homeVCF(project, pool, prior_step_extension) - outputVCF = homeVCF(project, pool, this_step_extension) - bamfile = pipelineBam(project, sample) - gatk_args = " -I "+bamfile+" -B variant,VCF,"+inputVCF+" -vcf "+outputVCF - subSystem("bsub -q gsa "+gatk_annotate_base + gatk_args) - -# definition for next section (and another one further down) - -def runSnpSelector(project_list,prior_step_annotation,this_step_annotation,info_field_list): - snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 10 --plottable" - for project in project_list: - sample_list = projects_to_samples[project] - for sample in sample_list: - pool = samples_to_pool_name[sample] - if ( project in pilot_projects ): - truth_arg = " -t "+hapmapVCF(project,sample) - else: - truth_arg = " -titv=3.6" - for info_field in info_field_list: - input_vcf = homeVCF(project,pool,prior_step_extension) - output_vcf = homeVCF(project,pool,this_step_annotation+"_"+info_field) - log_output = annotationProjectFile(project,pool,this_step_annotation+"_"+info_field+".log") - snp_selector_args = truth_arg+" -o "+output_vcf + " -l "+log_output + " -f "+info_field+" "+input_vcf - subSystem(snp_selector_base+snp_selector_args) - -# run snp selector to analyze info fields -if ( run_snp_selector ): - prior_step_extension = "_annotated" - this_step_extension = "_snpsel" - runSnpSelector(working_projects,prior_step_extension, this_step_extension, working_info_fields) - -# definition for the next stage - -def parseInfoField(info): - info_list = info.split(";") - info_dict = [] - for info_field in info_list: - keyval = info_field.split("=") - key = keyval[0] - val = float(keyval[1]) - info_dict.append([key, val]) - return dict(info_dict) - - -def 
vcfLinePassesThreshold(line, fields, vals,greaterthan): - if( line.startswith('#') ): - return True - else: - # parse the line - general_fields = line.split() - info_dict = parseInfoField(general_fields[vcf_header.index("INFO")]) - header_set = set(vcf_header) - # compare with thresholds - for j in range(len(fields)): - if ( fields[j] in header_set ): - if ( float(general_fields[vcf_header.index(fields[j])]) > vals[j] ): - if ( not greaterthan[j] ): - return False - else: - continue - else: - if ( greaterthan[j] ): - return False - else: - continue - elif ( fields[j] in info_dict ): - if ( float(info_dict[fields[j]]) > vals[j] ): - if ( not greaterthan[j] ): - return False - else: - continue - else: - if ( greaterthan[j] ): - return False - else: - continue - else: - print("Field not found "+fields[j]) - return True - -def makeThresholdVCF(prev_vcf,this_vcf,fields,vals,greater): - if ( DEBUG ): - filtered_lines = 0 - else: - out_vcf = open(this_vcf,'w') - print("open for writing: "+this_vcf) - wrotelines = 0 - for line in open(prev_vcf).readlines(): - if ( vcfLinePassesThreshold(line,fields,vals,greater) ): - if ( DEBUG ): - filtered_lines = filtered_lines + 1 - else: - out_vcf.write(line) - wrotelines=wrotelines+1 - if ( DEBUG ): - print(this_vcf+" would have filtered "+str(filtered_lines)+" variants") - else: - out_vcf.close() - -# make hard-threshold vcf files ( to see if a more strenuous set with less noise helps snp selector determine an ordering ) -if ( make_hard_threshold_vcf ): - prior_step_extension = "_annotated" - this_step_extension = "_hard_threshold" - threshold_fields = ["QUAL", "syzy_NMMR","syzy_DP"] - threshold_values = [15, 0.4,280] - threshold_greaterthan = [True, False,True] - for project in working_projects: - sample_list = projects_to_samples[project] - for sample in sample_list: - pool = samples_to_pool_name[sample] - prev_vcf = homeVCF(project,pool,prior_step_extension) - this_vcf = homeVCF(project,pool,this_step_extension) - 
makeThresholdVCF(prev_vcf,this_vcf,threshold_fields,threshold_values,greaterthan) - -# run snp selector on hard-threshold vcf files -if ( snp_select_threshold_vcf ): - prior_step_extension = "_hard_threshold" - this_step_extension = "_hard_snpsel" - runSnpSelector(working_projects, prior_step_extension, this_step_extension, working_info_fields) - -# definitions for next few steps - -def hapmapInfoFile(proj,pool,info,qscore): - return annotationProjectFile(project,pool,"_"+info+"_filter_at_q_"+qscore+"_hapmap_info.txt") - -def generateHapmapInfo(extension, qscorelist): - gatk_hapmap_info = gatk_cmd_base+" -T HapmapPoolAllelicInfo" - for project in working_projects: - sample_list = projects_to_samples[project] - for sample in sample_list: - pool = samples_to_pool_name[sample] - for infofield in working_info_fields: - inputVCF = homeVCF(project,pool,prior_step_extension+"_"+infofield) - inputBam = pipelineBam(project, sample) - ps = str(poolSize(project)) - for qscore in qscorelist: - outputFile = hapmapInfoFile(project, pool, infofield, qscore) - gatk_args = " -ps "+ps+" -I "+inputBam+" -of "+outputFile+" -B "+pool+",VCF,"+inputVCF+" -B "+poolBindings(project,sample)+" -samples "+poolSampleNames(project,sample)+" -q "+qscore - subSystem("bsub -q gsa "+gatk_hapmap_info+gatk_args) - -# generate hapmap (false postive/false negative) info files on filtered vcfs -if ( generate_hapmap_info ): - prior_step_extension = "_snpsel" - generateHapmapInfo(prior_step_extension, working_quality_scores) - -if ( generate_threshold_hapmap_info ): - prior_step_extension = "_hard_snpsel" - generateHapmapInfo(prior_step_extension, working_quality_scores) - -# definitions for next section -def falsenegSnpSelect(invcf, false_neg_vcf, qscore): - tmp_vcf = "tmp.vcf" - tmp_recal_vcf = "tmp2.vcf" - tmp_log = "tmp.log" - snpsel_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 30 --plottable -l "+tmp_log+" -o "+tmp_recal_vcf+" --FNOutputVCF="+false_neg_vcf+" "+tmp_vcf - cmd = 
"cat "+invcf+" | awk '{if ( substr($1,0,1) == '#' || $5 > "+qscore+" ) print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' | sed 's/ /\t/g' > "+tmp.vcf - subSystem(cmd) - subSystem(snpsel_base) - cmd = "rm "+tmp_vcf - subSystem(cmd) - cmd = "rm "+tmp_recal_vcf - subSystem(cmd) - cmd = "rm "+tmp_log - subSystem(cmd) - -def falseNegsBySnpSelector(extension): - for project in working_projects: - sample_list = projects_to_samples[project] - for sample in sample_list: - pool = samples_to_pool_name[sample] - for infofield in working_info_fields: - input_vcf = homeVCF(project,pool,extension+"_"+infofield) - for qscore in working_quality_scores: - false_neg_vcf = annotationProjectFile(project, pool, "_"+infofield+"_q_"+qscore+"_false_negs.vcf") - falsenegSnpSelect(input_vcf,false_neg_vcf,qscore) - -# generate false negative sites in a vcf file via snp selector -if ( false_negatives_by_snp_selector): - prior_step_extension = "_snpsel" - falseNegsBySnpSelector(prior_step_extension) - -# generate false negative sites ina vcf file on the thresholded vcfs -if ( false_negatives_by_snp_selector_on_threshold ): - prior_step_extension = "_hard_snpsel" - falseNegsBySnpSelector(prior_step_extension) - -# definition for matlab section -def makeMatlabMetricPlot(extension): - for project in working_projects: - sample_list = projects_to_samples[project] - for sample in sample_list: - pool = samples_to_pool_name[sample] - for infofield in working_info_fields: - log_output = annotationProjectFile(project,pool,extension+"_"+infofield+".log") - figureName = figurePath(project,pool)+pool+extension - cmd = "matlab -r \"cd "+matlab_dir+" , snpSelectorPlots('"+log_output+"','"+figureName+"') , exit\"" - subSystem(cmd) - -# make matlab plots of the metrics (generated from the log files) -if ( plot_info_field_metrics ): - prior_step_extension = '_snpsel' - makeMatlabMetricPlot(prior_step_extension) - -# make matlab plots of the metrics from the hard-thresholded log files -if ( 
plot_info_field_metrics_threshold ): - prior_step_extension = "_hard_snpsel" - makeMatlabMetricPlot(prior_step_extension) - -# definitions for best effort section - -def bestEffortVCF(proj): - bevcf_path = homeVCF(proj,proj,"_best_effort") - #bevcf = open(bevcf_path,'w') - #bevcf.write("##source=best_effort_vcf") - #bevcf.write("##format=VCRv3.2") - #bevcf.write("#"+"\t".join(vcf_header)+"\t"+proj+"_combined_calls") - #bevcf.close() - return bevcf_path - -def keyFromFields(fields): - return ":".join([fields[0],fields[1]]) - -def inVCFDict(vdict, fields): - return ( keyFromFields(fields) in vdict ) - -def addToVCFDict(vdict, fields): - vdict[keyFromFields(fields)] = fields - return vdict - -def updateInfoField(key, info1, info2): - if ( key == "HRun" ): # this one doesn't change - return [key, str(info1)] - else: - return [key, str( (float(info1) + float(info2))/2 )] - -def incorporateNewInfo(vdict, infodict, fields): - addNPools = True - poskey = keyFromFields(fields) - vline = vdict[poskey] - vinfodict = parseInfoField(vline[vcf_header.index("INFO")]) - newInfo = [] - all_keys = set(vinfodict.keys()).union(set(infodict.keys())) - for infokey in all_keys: - if ( infokey in vinfodict and infokey in infodict ): - newInfo.append("=".join(updateInfoField(infokey, vinfodict[infokey], infodict[infokey]) ) ) - elif ( infokey in infodict ): - newInfo.append("=".join([infokey, str(infodict[infokey])]) ) - elif ( infokey == "nPools" ): - newPools = str( int(vinfodict[infokey]) + 1 ) - newInfo.append("=".join([infokey,newPools])) - addNPools = False - else: - continue - if ( addNPools ): - newInfo.append("nPools=1") - vline[vcf_header.index("INFO")] = ";".join(newInfo) - vdict[poskey] = vline - return vdict - -def incorporateQuality(vdict, fields): - poskey = keyFromFields(fields) - vfields = vdict[poskey] - qual_index = vcf_header.index("QUAL") - vfields[qual_index] = str( float(vfields[qual_index]) + float(fields[qual_index]) ) - vdict[poskey] = vfields - return vdict - 
-def bestEffortMerge(call_dict,vcf_to_merge_path): - for line in open(vcf_to_merge_path).readlines(): - if ( line.startswith("#") ): - continue - else: - vcf_fields = line.strip().split() - if ( not inVCFDict(call_dict, vcf_fields) ): - call_dict = addToVCFDict(call_dict,vcf_fields) - else: - vcf_info_dict = parseInfoField(vcf_fields[vcf_header.index("INFO")]) - call_dict = incorporateNewInfo(call_dict,vcf_info_dict,vcf_fields) - call_dict = incorporateQuality(call_dict,vcf_fields) - return call_dict - -def dictToFile(vdict, filepath,source,samplename): - file = open(filepath,'w') - file.write("##source="+source) - file.write("##version=VCRv3.2") - file.write("#"+"\t".join(vcf_header)+"\t"+samplename) - if ( vdict ): - for key in vdict.keys(): - file.write("\t".join(vdict[key])+"\n") - file.close() - else: - print("dict had no keys for "+filepath) - -# make the best effort VCF -if ( make_best_effort_vcf ): - best_thresh_fields = ["QUAL"] - best_thresh_val = [7] - best_thresh_gt = [True] - prev_extension = "_annotated" - for project in working_projects: - best_effort_vcf = bestEffortVCF(project) - best_effort_dict = dict() - sample_list = projects_to_samples[project] - for sample in sample_list: - pool = samples_to_pool_name[sample] - in_vcf = homeVCF(project,pool,prev_extension) - outvcf = homeVCF(project,pool,"_temporary") - makeThresholdVCF(in_vcf,outvcf,best_thresh_fields,best_thresh_val,best_thresh_gt) - print("dict about to update, size is currently: "+str(len(best_effort_dict))) - best_effort_dict = bestEffortMerge(best_effort_dict,outvcf) - print("updated size: "+str(len(best_effort_dict))) - subSystem("rm "+outvcf) - dictToFile(best_effort_dict,best_effort_vcf,"make_best_effort_vcf",project+"_best_effort") - -if ( snp_select_best_effort_vcf ): - snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 20 --plottable -f "+",".join(working_info_fields) - for project in working_project: - best_effort_vcf = bestEffortVCF(project) - 
snp_recal_vcf = homeVCF(project,project,"_best_effort_selected") - log = annotationProjectFile(project,project,"_best_effort_selected.log") diff --git a/python/countCoverageWithSamtools.py b/python/countCoverageWithSamtools.py deleted file mode 100644 index 70d3c0cb9..000000000 --- a/python/countCoverageWithSamtools.py +++ /dev/null @@ -1,46 +0,0 @@ -import farm_commands -import os.path -import sys - -for line in open(sys.argv[1]): - fastb = line.strip() - head, fastb_filename = os.path.split(fastb) - filebase = os.path.splitext(fastb_filename)[0] - fasta = filebase + '.fasta' - - # convert the fasta - if not os.path.exists(fasta): - cmd = "Fastb2Fasta IN="+fastb+" OUT="+fasta - farm_commands.cmd(cmd) - - qualb = os.path.join(head, filebase + '.new.qualb') - quala = filebase + '.quala' - if not os.path.exists(quala): - cmd = "Qualb2Quala IN="+qualb+" OUT="+quala - farm_commands.cmd(cmd) - - fastq = filebase + '.fastq' - if not os.path.exists(fastq): - cmd = "FastaQuals2Fastq.py "+fasta+" "+quala+ " "+fastq - farm_commands.cmd(cmd) - - filteredFastq = filebase + '.filtered.fastq' - if not os.path.exists(filteredFastq): - cmd = "/seq/dirseq/maq-0.7.1/maq catfilter "+fastq+" > "+filteredFastq - farm_commands.cmd(cmd) - - filteredFasta = filebase + '.filtered.fasta' - print 'Looping' - if not os.path.exists(filteredFasta): - out = open(filteredFasta,'w') - iter = open(filteredFastq).__iter__(); - for line in iter: - if line[0] == '@': - print >> out, '>%s' % line[1:].strip() - print >> out, iter.next().strip() - - sam = filebase + '.sam' - cmd = "bwahuman samse 32 " + fastq + " " + sam - print cmd - -samtools view tcga-freeze3-tumor.rev_2.bam chr1:15,000,000-15,002,000 | wc --lines \ No newline at end of file diff --git a/python/createCaseControlMetaData.py b/python/createCaseControlMetaData.py deleted file mode 100644 index 9d24e532b..000000000 --- a/python/createCaseControlMetaData.py +++ /dev/null @@ -1,61 +0,0 @@ -import sys -import os -import subprocess -import 
shlex - -from optparse import OptionParser - -def parseInput(fList,ignoreExt = None): - inNames = [] - for ele in fList: - if ( ignoreExt != None and ele.endswith(ignoreExt) ): - inFileNames.append(ele) - if ( os.path.exists(ele) ): - for line in open(ele).readlines(): - inNames.append(line.strip()) - return inNames - -def bamsWithSamples(bamList): - cmdbase = "samtools view -H %s | grep SM | tr '\\t' '\\n' | grep SM | sed 's/SM://g' | uniq" - sam2bam = dict() - for bf in bamList: - if ( not os.path.exists(bf) ): - raise IOError("Bam file "+bf+" does not exist") - cmd = cmdbase % bf - proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) - proc.wait() - stdout = proc.stdout.readlines() - if ( len(stdout) > 1 ): - raise RuntimeError("The bam file "+bf+" contains multiple different sample entries") - sm = stdout[0].strip() - sam2bam[sm]=bf - return sam2bam - -def runMain(opt,arg): - bamFiles = bamsWithSamples(parseInput(opt.bam_files,"bam")) - caseNames = set(parseInput(opt.cases)) - controlNames = set(parseInput(opt.controls)) - output = open(opt.output,'w') - output.write("samples:") - sample_base = "- id: %s\n properties:\n cohort: %s\n bam: %s" - for s in bamFiles.keys(): - cc = "Unknown" - if ( s in caseNames ): - cc = "case" - if ( s in controlNames ): - cc = "control" - output.write("\n" + sample_base % (s,cc,bamFiles[s])) - output.close() - -def main(): - usage = "usage: %prog [options] arg" - parser = OptionParser(usage) - parser.add_option("-I","--bams",dest="bam_files",help="the bam files, as multiple arguments or a simple newline-delimited file",action="append") - parser.add_option("-A","--case",dest="cases",action="append",help="A list of the case samples, as multiple arguments or a simple newline-delimited file") - parser.add_option("-O","--control",dest="controls",action="append",help="A list of the control samples, multiple arguments or a newline-delimited file") - parser.add_option("-o","--out",dest="output",action="store",help="Name of the 
output metadata file to write to") - (options,args) = parser.parse_args() - runMain(options,args) - -if __name__ == "__main__": - main() diff --git a/python/dataProcessingPaper.py b/python/dataProcessingPaper.py deleted file mode 100755 index 4fcd251b3..000000000 --- a/python/dataProcessingPaper.py +++ /dev/null @@ -1,391 +0,0 @@ -from farm_commands2 import * -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import faiReader -import math -import shutil -import string -from madPipelineUtils import * - -EXCLUDE_CHRS = ['chrM', 'chrY'] -EXTRA_GATK_ARGS = ' -XL chrM -XL chrY ' # -XL chrX -XL chrY ' -#EXTRA_GATK_ARGS = ' -XL chrM -XL chrY ' # -XL chrX -XL chrY ' -VALIDATION_DIR = '/humgen/gsa-hpprojects/GATK/data/Comparisons' -BAM_ROOT = '/humgen/1kg/analysis/bamsForDataProcessingPapers/' -WE_LIST = '/seq/references/HybSelOligos/whole_exome_agilent_designed_120/whole_exome_agilent_designed_120.targets.interval_list' -WGS_FILTER = [['ABFilter', 'AB > 0.75 && DP > 40'], ['DPFilter', 'DP > 120 || SB > -0.10']] # , ['QDFilter', 'QD < 5.0 && DP > 40']] -WE_FILTER = [['ESPStandard', 'AB > 0.75 || QD < 5.0 || HRun > 3 || SB > -0.10']] - -#UG_ARGS = "-mbq 20 -mmq 20 -stand_call_conf 50 -stand_emit_conf 10 -hets 0.78e-3 -dcov 10000 -pnrm GRID_SEARCH" -UG_ARGS = "-stand_call_conf 10 -stand_emit_conf 10 --downsampling_type BY_SAMPLE -dcov 1000 -hets 0.78e-3" # experimental arguments for GdA test - -class CallTarget: - def __init__(self, name, bam, interval = '', callArgs = "", b36 = False, optimize = True, filters = [], targetTiTv = 2.07, maxClusters = 16, minQual = 300, tranchToTake = 1): - self.name = name - self.bam = bam - self.interval = interval - self.callArgs = callArgs - self.vcfs = [] # list of processed vcf - self.b36 = b36 - self.filters = filters - self.optimize = optimize - self.targetTiTv = targetTiTv - self.maxClusters = maxClusters - self.tranchToTake = tranchToTake - self.minQual = minQual - - 
def getCallArgs(self): - return self.callArgs # + self.getIntervalArg() - - def getIntervalArg(self): - if self.hasInterval(): - return ' -L ' + self.interval - else: - return '' - - def hasInterval(self): - return self.interval != '' and self.interval != None - - def getVcf(self): - return os.path.join(OPTIONS.dir, self.name + ".vcf") - - def getVcfs(self): - return self.vcfs - - def addVcf(self, vcf): - self.vcfs.append(vcf) - - def getBam(self): - return self.bam - -KG_PATH = '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls' -TECH_COMP = '/humgen/gsa-hphome1/kiran/one_off_projects/multiTechComparisons/results/v7/NA12878' - -#WGS_INTERVAL = 'chr1' -#WGS_INTERVAL = '-L chr1:1-50,000,000' - -def weTarget(name, bam, ignore = '', args = '', filters = None): - # 3.0 was old target, new is 2.8 - return CallTarget(name, bam, interval = WE_LIST, callArgs = args, filters = WE_FILTER, targetTiTv = 2.8, maxClusters = 12, minQual = 2800, tranchToTake = 10) - -#TARGETS_BY_STRATEGY = [['', ''], ['.OQ', '-OQ'], ['.OQ.noCM', '-OQ -bm THREE_STATE'], ['.noCM', '-bm THREE_STATE']] -TARGETS_BY_STRATEGY = [['', ''], ['.OQ', '-OQ']] -#TARGETS_BY_STRATEGY = [['', '']] -def targetsByStrategy(func, rootName, bam, interval = '', args = '', filters = []): - def makeTarget(ext, moreArgs): - name = rootName + ext - return func(name, bam, interval, args + ' ' + moreArgs, filters = filters) - - if "cleaned" in rootName or "CG" in rootName: - strats = [TARGETS_BY_STRATEGY[0]] - else: - strats = [TARGETS_BY_STRATEGY[1]] - return map(lambda x: makeTarget(*x), strats) - -targets = [] - -def findTargets(names): - def find1(name): - for target in targets: - if target.name == name: - return target - return None - return map(find1, names) - -def matches(string, pattern): - return string.find(pattern) != -1 - -def main(): - global OPTIONS, targets - usage = "usage: %prog stage [options]" - parser = OptionParser(usage=usage) - parser.add_option("", "--dry", dest="dry", - 
action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - parser.add_option("-v", "--verbose", dest="verbose", - action='store_true', default=False, - help="If provided, print out a lot of information") - parser.add_option("", "--byQEval", dest="byQEval", - action='store_true', default=False, - help="If provided, variant eval will be run by Q threshold") - parser.add_option("-s", "--splitByChr", dest="splitByChr", - action='store_true', default=False, - help="If provided, we'll parallelize by chromosome over the farm") - parser.add_option("", "--dev", dest="dev", - action='store_true', default=False, - help="If provided, we'll use the GATK dev build") - parser.add_option("-d", "--dir", dest="dir", - type='string', default="", - help="If provided, this is the root where files are read and written") - parser.add_option("-L", "", dest="WGSIntervals", - type='string', default=None, - help="If provided, these are the interval files we will process for WGS") - parser.add_option("-q", "--farm", dest="farmQueue", - type="string", default=None, - help="Farm queue to send processing jobs to") - parser.add_option("-p", "--parallel", dest="parallel", - type="int", default=None, - help="Number of parallel shared memory threads") - parser.add_option("-t", "--target", dest="target", - type="string", default=None, - help="Only run jobs with names containing this string") - parser.add_option("", "--noRaw", dest="noRaw", - action="store_true", default=False, - help="Exclude raw calls from output") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 1: - parser.error("incorrect number of arguments") - - # set up targets - # WGS - WGS_INTERVAL = OPTIONS.WGSIntervals - #targets += targetsByStrategy(CallTarget, 'GA2.WGS.cleaned', BAM_ROOT + '/NA12878.GA2.WGS.bwa.cleaned.bam', WGS_INTERVAL, filters = WGS_FILTER) - #targets.append(CallTarget('GA2.WGS.raw', '/seq/dirseq/pem/seq/picard_aggregation/G2946gaII/NA12878/v1/NA12878.bam', 
WGS_INTERVAL, filters = WGS_FILTER)) - - # HiSeq - targets += targetsByStrategy(CallTarget, 'HiSeq.WGS.raw', '/seq/dirseq/pem/seq/picard_aggregation/G2946/NA12878/v1/NA12878.bam', WGS_INTERVAL, filters = WGS_FILTER) - #targets += targetsByStrategy(CallTarget, 'HiSeq.WGS.cleaned', '/humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/tmp.list', WGS_INTERVAL, filters = WGS_FILTER) - targets += targetsByStrategy(CallTarget, 'HiSeq.WGS.cleaned', BAM_ROOT + '/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam', WGS_INTERVAL, filters = WGS_FILTER) - - # WE - targets += targetsByStrategy(weTarget, 'GA2.WEx.cleaned', BAM_ROOT + '/NA12878.WEx.cleaned.recal.bam') - targets += targetsByStrategy(weTarget, 'GA2.WEx.raw', '/seq/picard_aggregation/C308/NA12878/v3/NA12878.bam') - #targets.append(weTarget('GA2.WEx.raw', '/seq/picard_aggregation/C308/NA12878/v3/NA12878.bam')) - - #targets += targetsByStrategy(CallTarget, 'CG.WGS.raw', '/seq/complete_genomics/GS00106-DNA_E01-180_NA12878/SAM0/merge/NA12878.bam', WGS_INTERVAL, filters = WGS_FILTER) - - # CG - # todo -- fixme -- needs genome-wide bams on hg18 - #targets.append(CallTarget('CG.chr1.raw', '/humgen/gsa-hphome1/kiran/one_off_projects/multiTechComparisons/results/v5/NA12878/CG.full/sample.chr1.primaryAlignmentsMarked.dupesRemoved.bam', WGS_INTERVAL, filters = WGS_FILTER, callArgs = "-bm THREE_STATE", b36 = True)) - - # low-pass - # '/humgen/gsa-hpprojects/1kg/1kg_pilot1/freeze5_merged/low_coverage_CEU.1.bam - # targets.append(CallTarget('CEU.lowpass.cleaned', 'CEU.bam.list', WGS_INTERVAL, b36 = True)) - - # 1KG SLX - # targets.append(CallTarget('1KG.NA12878', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/useTheseBamsForAnalyses/NA12878.SLX.bam', WGS_INTERVAL, b36 = True)) - - # MCDK1 special case - #MCKD1_INTERVAL = "chr1:152448527-154998173" - #targets.append(CallTarget('MCKD1.raw', '/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/MCKD1_WGS/MCKD1.bam.list', MCKD1_INTERVAL, filters = WGS_FILTER, callArgs = '-bm THREE_STATE')) 
- #targets.append(CallTarget('MCKD1.cleaned', '/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/MCKD1_WGS/bams_linkage/MCKD1.bam.cleaned.bam.chr1:152448527-154998173.bam', MCKD1_INTERVAL, filters = WGS_FILTER, callArgs = '-bm THREE_STATE')) - - stages = map(string.lower, args[0].split(",")) - STAGES = ['callsnps', 'callindels', 'indelmask', 'snpfilter', 'indelfilter', 'to_hg18', 'optimize', 'eval', 'confusion_matrix'] - for stage in stages: - if stage not in STAGES: - sys.exit('unknown stage ' + stage) - - if OPTIONS.dir != "" and not os.path.exists(OPTIONS.dir): - os.makedirs(OPTIONS.dir) - - allJobs = [] - - def includeStage(name): - return name in stages - - for callTarget in targets: - if "raw" in callTarget.name and OPTIONS.noRaw: - print 'Skipping raw data', callTarget - continue - - # setup pipeline args - GATK_JAR = GATK_STABLE_JAR - if ( OPTIONS.dev ): GATK_JAR = GATK_DEV_JAR - myPipelineArgs = PipelineArgs(GATK_JAR = GATK_JAR, name = callTarget.name, excludeChrs = EXCLUDE_CHRS) - myPipelineArgs.addGATKArg(EXTRA_GATK_ARGS) - myPipelineArgs.addGATKArg(callTarget.getIntervalArg()) - if ( OPTIONS.parallel != None ): myPipelineArgs.addGATKArg(' -nt ' + OPTIONS.parallel) - if ( callTarget.b36 ): myPipelineArgs.ref = 'b36' - print callTarget.name, callTarget.b36, myPipelineArgs.ref - - lastJobs = None - if OPTIONS.target != None and not matches(callTarget.name, OPTIONS.target): - print 'Skipping target', callTarget - continue - - def updateNewJobs(newjobs, lastJobs): - if OPTIONS.verbose: - print 'New jobs', newjobs - #for job in newjobs: - # print ' job ', job - allJobs.append(newjobs) - if newjobs != []: - lastJobs = newjobs - return [], lastJobs - - newJobs = [] - def execStage(name, func, vcf = None, lastJobs = []): - if OPTIONS.verbose: print 'Name is', name - newJobs, newVcf = func(myPipelineArgs, callTarget, vcf, lastJobs) - if newVcf != None: vcf = newVcf - if OPTIONS.verbose: print 'VCF is', vcf - callTarget.addVcf(vcf) - if includeStage(name): 
newJobs, lastJobs = updateNewJobs(newJobs, lastJobs) - if OPTIONS.verbose: print 'execStage:', newJobs, lastJobs, vcf - return newJobs, lastJobs, vcf - - - newJobs, callSNPJobs, vcf = execStage('callsnps', callSNPs) - newJobs, lastJobs, vcf = execStage('to_hg18', convertToHg18, vcf, callSNPJobs) - newJobs, filterSNPJobs, vcf = execStage('snpfilter', filterSNPs, vcf, callSNPJobs) - - # indel jobs - newJobs, callIndelJobs, vcf = execStage('callindels', callIndels, vcf) - newJobs, indelMaskJobs, vcf = execStage('indelmask', createIndelMask, vcf, callIndelJobs) - newJobs, filterIndelJobs, vcf = execStage('indelfilter', filterIndels, vcf, indelMaskJobs + filterSNPJobs) - - # optimization - newJobs, optimizeJobs, vcf = execStage('optimize', VariantOptimizer, vcf, filterIndelJobs) - - # eval - newJobs, evalJobs, vcf = execStage('eval', evalSNPs, vcf, optimizeJobs) - - # newJobs, lastJobs, ignore = execStage('confusion_matrix', computeConfusionMatrix, vcf) - - print 'EXECUTING JOBS' - executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry) - -# -# Actual commands -# -def convertToHg18( myPipelineArgs, callTarget, vcf, lastJobs ): - if callTarget.b36: - outputVCF = vcf.replace(".b36", "") - cmd = 'python /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/python/vcf_b36_to_hg18.py ' + vcf + ' ' + outputVCF - jobs = [FarmJob(cmd, jobName = callTarget.name + '.' 
+ 'b36ToHg18', dependencies = lastJobs)] - return jobs, outputVCF - else: - return [], vcf - -def callSNPs( myPipelineArgs, callTarget, ignore, lastJobs ): - outputVCF = appendExtension(callTarget.getVcf(), "ug") - if callTarget.b36: - outputVCF = appendExtension(outputVCF, "b36") - print 'outputVCF', outputVCF - ugArgs = '-T UnifiedGenotyperV2 -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s %s -o %s %s' % (callTarget.getBam(), UG_ARGS, outputVCF, callTarget.getCallArgs()) - farmCmds = simpleGATKCommand( myPipelineArgs, 'call.UG', ugArgs, lastJobs ) - if OPTIONS.splitByChr and not callTarget.hasInterval(): - farmCmds = splitGATKCommandByChr( myPipelineArgs, farmCmds[0], [outputVCF], [mergeVCFs] ) - return farmCmds, outputVCF - -INDEL_MASK_SIZE = 10 -def getIndelCallFiles(callTarget): - outputBed = appendExtension(callTarget.getVcf(), "indels.bed", False) - outputVerbose = appendExtension(callTarget.getVcf(), "indels.verbose.txt", False) - outputVCF = appendExtension(callTarget.getVcf(), "indels.vcf", False) - outputMask = appendExtension(callTarget.getVcf(), "indels.%d.mask" % INDEL_MASK_SIZE, False) - return outputBed, outputVerbose, outputVCF, outputMask - -def callIndels( myPipelineArgs, callTarget, ignore, lastJobs ): - outputBed, outputVerbose, indelsVCF, outputMask = getIndelCallFiles(callTarget) - IGV2_ARGS = '-T IndelGenotyperV2 -ws 500 -I %s -bed %s -verbose %s -o %s -rf Platform454' % (callTarget.getBam(), outputBed, outputVerbose, indelsVCF) - farmCmds = simpleGATKCommand( myPipelineArgs, 'call.CallIndels', IGV2_ARGS, lastJobs ) - if OPTIONS.splitByChr and not callTarget.hasInterval(): - farmCmds = splitGATKCommandByChr( myPipelineArgs, farmCmds[0], [outputBed, outputVerbose], [mergeByCat, mergeByCat] ) - return farmCmds, None - -def createIndelMask( myPipelineArgs, callTarget, vcf, lastJobs ): - outputBed, outputVerbose, outputVCF, outputMask = getIndelCallFiles(callTarget) - cmd = 'python 
/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/python/makeIndelMask.py %s %d %s' % (outputBed, INDEL_MASK_SIZE, outputMask) - jobs = [FarmJob(cmd, jobName = callTarget.name + '.' + 'makeIndelMask', dependencies = lastJobs)] - return jobs, None - -def filterSNPs(myPipelineArgs, callTarget, vcf, lastJobs ): - out = appendExtension(vcf, 'snpfiltered') - filterString = ' '.join(map(lambda x: '--filterName %s --filterExpression "%s"' % (x[0], x[1]), callTarget.filters)) - return simpleGATKCommand( myPipelineArgs, 'call.filterSNPs', '-T VariantFiltration -B:variant,VCF %s -o %s --filterName LowQual --filterExpression "QUAL < 50.0" --clusterWindowSize 10 --filterName HARD_TO_VALIDATE --filterExpression "MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1)" %s' % ( vcf, out, filterString ), lastJobs ), out - -def filterIndels(myPipelineArgs, callTarget, vcf, lastJobs ): - out = appendExtension(vcf, 'indelfiltered') - outputBed, outputVerbose, outputVCF, outputMask = getIndelCallFiles(callTarget) - return simpleGATKCommand( myPipelineArgs, 'call.filterIndels', '-T VariantFiltration -B:variant,VCF %s -o %s --maskName Indel -B:mask,Bed %s' % ( vcf, out, outputMask ), lastJobs ), out - -def VariantOptimizer( myPipelineArgs, callTarget, vcf, lastJobs ): - if callTarget.optimize: - clusterFile = appendExtension(vcf, 'optimized.clusters', False) - tranchesFile = appendExtension(vcf, 'optimized.tranches', False) - optOutVCF = appendExtension(vcf, 'optimized') - out = appendExtension(vcf, 'optimized.cut') - table = appendExtension(vcf, 'optimized.table', False) - - #NCLUSTERS = 4 - #ITERATIONS = 3 - #ITERATION_TO_TAKE = ITERATIONS - DBSNP_PRIOR = 2.0 # dbSNP seems dodger and dodger - IGNORE_FILTERS_CLUSTERING = "-ignoreFilter DPFilter -ignoreFilter ABFilter -ignoreFilter ESPStandard" - #IGNORE_FILTERS_CLUSTERING = "-ignoreFilter DPFilter -ignoreFilter ABFilter -ignoreFilter LowQual -ignoreFilter ESPStandard" - IGNORE_FILTERS_SCORING = IGNORE_FILTERS_CLUSTERING + " -ignoreFilter 
HARD_TO_VALIDATE" - annotationsToOptimize = ['SB', 'HaplotypeScore', "QD", 'HRun'] - annotationsToOptimizeArg = ' '.join(map(lambda x: '-an ' + x, annotationsToOptimize)) # '' ['DP', 'SB', 'HaplotypeScore', 'MQ', "QD", 'HRun'] - #tranches = ' '.join(map( lambda x: '-tranche ' + str(x), [1, 5, 10])) - tranches = ' '.join(map( lambda x: '-tranche ' + str(x), [0.1, 1, 10])) - maxVariantsToShow = 2500 - #singletonFPRate = 0.2 - #hapmapVCF = '/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr.hg18_fwd.vcf' - hapmapVCF = 'hapmap_analysis/sitesr27_nr.hg18_fwd.vcf' - - REGENERATE_VARIANT_CLUSTERS = True - if ( REGENERATE_VARIANT_CLUSTERS ): - jobs1 = simpleGATKCommand( myPipelineArgs, 'call.GenerateVariantClusters', '-T GenerateVariantClusters -qual %d -std 3.5 -mG %d -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -B:hapmap,vcf %s -B:input,VCF %s -clusterFile %s %s %s --NoByHapMapValidationStatus' % ( callTarget.minQual, callTarget.maxClusters, hapmapVCF, vcf, clusterFile, annotationsToOptimizeArg, IGNORE_FILTERS_CLUSTERING ), lastJobs ) - #jobs1 = simpleGATKCommand( myPipelineArgs, 'call.GenerateVariantClusters', '-T GenerateVariantClusters -qual %d -std 3.5 -mG %d -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -B:input,VCF %s -clusterFile %s %s %s' % ( callTarget.minQual, callTarget.maxClusters, vcf, clusterFile, annotationsToOptimizeArg, IGNORE_FILTERS_CLUSTERING ), lastJobs ) - else: - jobs1 = lastJobs - jobs2 = simpleGATKCommand( myPipelineArgs, 'call.VariantRecalibrator', '-T VariantRecalibrator -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -B:input,VCF %s -clusterFile %s -o %s --target_titv %f %s -resources ~/dev/GenomeAnalysisTK/trunk/R/ %s -priorDBSNP %.2f -tranchesFile %s' % ( vcf, clusterFile, optOutVCF, callTarget.targetTiTv, IGNORE_FILTERS_SCORING, tranches, DBSNP_PRIOR, tranchesFile ), jobs1 ) - - cmd21 = 'python /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/python/vcf2table.py -f CHROM,POS,ID,AC,AF,AN,DB,' + 
','.join(annotationsToOptimize) + ' ' + vcf + ' -o ' + table - jobs21 = [FarmJob(cmd21, jobName = callTarget.name + '.call.' + 'VariantRecalibrationReport.vcf2table', dependencies = jobs2)] - - cmd22 = 'Rscript /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/R/VariantRecalibratorReport/VariantRecalibratorReport.R %s %s %s NA %d' % (clusterFile, clusterFile, table, maxVariantsToShow) - jobs22 = [FarmJob(cmd22, jobName = callTarget.name + '.call.' + 'VariantRecalibrationReport.RScript', dependencies = jobs21)] - - jobs3 = simpleGATKCommand( myPipelineArgs, 'call.ApplyVariantCuts', '-T ApplyVariantCuts -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -B:input,VCF %s -o %s -tranchesFile %s --fdr_filter_level %f' % ( optOutVCF, out, tranchesFile, callTarget.tranchToTake ), jobs2 ) - return jobs1 + jobs2 + jobs21 + jobs22 + jobs3, out - else: - return [], vcf - -def computeConfusionMatrix(myPipelineArgs, callTarget, vcf, lastJobs ): - out = appendExtension(vcf, 'confusionmatrix', addExtension=False) - CM_ARGS = '-T ComputeConfusionMatrix -I %s -o %s' % (callTarget.getBam(), out) - farmCmds = simpleGATKCommand( myPipelineArgs, 'ConfusionMatrix', CM_ARGS, lastJobs ) - return farmCmds, None - -def evalSNPs(myPipelineArgs, callTarget, vcf, lastJobs): - evalRoot = OPTIONS.dir - - oldMemory = myPipelineArgs.memory - myPipelineArgs.memory = '3g' - def eval1(vcf, namePostfix = "", args = ""): - out = os.path.join(OPTIONS.dir, os.path.basename(vcf) + namePostfix + ".ve2") - maybeHiSeqBindings = "" - hiSeqComp = os.path.join(OPTIONS.dir,"HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf") - omni = " -B:compOmni,VCF Omni.NA12878.hg18.vcf" - if os.path.exists(hiSeqComp): - maybeHiSeqBindings = "-B:comp_HiSeq,VCF " + hiSeqComp + " " - validation_bindings = maybeHiSeqBindings + "-B:comp_p2_val,VCF 1kg_pilot2_snps.hg18.vcf -B:comp_CG,VCF /humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/CG.hg18.vcf -B:compTrio,VCF CEU.trio.2010_03.genotypes.vcf 
-B:compTrioNovel,VCF CEU.trio.novels.2010_03.genotypes.vcf -B:comp_hm3,VCF /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/by_population/genotypes_CEU_phase3.2_consensus.hg18_fwd.vcf " + omni # not in hg18 space :-( - tranches = "" - if vcf.find("optimized") != -1: - args += " -tf " + appendExtension(vcf.replace(".cut", ""), 'tranches', False) - vcf = appendExtension(vcf.replace(".cut", ""), 'vcf', False) - gatk_args = ("-T VariantEval -reportType Grep -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -B:eval,VCF %s " + validation_bindings + " -sample NA12878 -o %s -E CompOverlap -E GenotypeConcordance -E TiTvVariantEvaluator -E CountVariants %s") % ( vcf, out, args ) - - name = "EVAL_%s_%s" % (callTarget.name, namePostfix) - return simpleGATKCommand( myPipelineArgs, name, gatk_args, lastJobs )[0] - - jobs = [] - for vcf in callTarget.getVcfs(): - jobs.append(eval1(vcf)) - if OPTIONS.byQEval and vcf.find("optimized") != -1: - for Q in [0.01, 0.02, 0.03, 0.03]: - jobs.append(eval1(vcf, '.Q' + str(Q), '-Q ' + str(Q))) - - myPipelineArgs.memory = oldMemory - return jobs, None - -if __name__ == "__main__": - main() diff --git a/python/expandedSummaryToVCF.py b/python/expandedSummaryToVCF.py deleted file mode 100755 index 8affcfdc4..000000000 --- a/python/expandedSummaryToVCF.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python -#import farm_commands -import os.path -import sys -import re -import time -import math - -def grepFile(string, file): - return grep(string, file.readlines()) - -def grep(string, list): - expr = re.compile(string) - return [elem for elem in list if expr.match(elem)] - -def dictFromCombinedErrorCoverageFile(f): - dictList = [] - for line in f: - ln = line.split() - key = ln[0] - item = ln - dictList.append([key,item]) - return dict(dictList) - -def qualFromLod(L): - try: - X = math.exp(-L) - except OverflowError: - return 0 - try: - return math.floor(-10*math.log10(X/1+X)) - except OverflowError: - return 1000 - -callFileStr 
= sys.argv[1] -callFile = open(callFileStr) -pipelineSamplesFile = open(sys.argv[2]) -directory = os.getcwd() -syzygyPathSamples = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/" -sortByRefPath = "/humgen/gsa-hphome1/chartl/sting/perl/sortByRef.pl" - -poolNames = [] -poolInternalIDs = [] -poolCombinedErrorCoverageCalls = [] -proj = "" # required for an output file -for line in pipelineSamplesFile: - ln = line.strip('\n') - ln = ln.split(";") - #ln = line.split("\t") # -depends on format - poolNames.append(ln.pop(2)) - piid = ln.pop(0) - poolInternalIDs.append(piid) - proj = ln.pop(0) - ceccPath = syzygyPathSamples+proj+"/"+piid+"/"+proj+"."+piid+".bam.combined.error.coverage.calls" - print("reading: "+ceccPath) - poolCombinedErrorCoverageCalls.append(dictFromCombinedErrorCoverageFile(open(ceccPath))) - -pooledOutputFiles = [] -for pool in poolNames: - pooledOutputFiles.append(open(directory+"/"+pool+"_calls.vcf",'w')) - -pooledCallsFile = open(directory+"/"+proj+"_combined_calls.vcf",'w') - -header1 = "##format=PCFv1\n" -header2 = "##filedate="+time.strftime("%Y%m%d")+"\n" -header3 = "##source=expandedSummaryToVCF:"+callFileStr+"\n" -header4 = "##reference=Homo_sapiens_assembly18\n" -header5 = "##phasing=pooled\n" -header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"+"\t"+"\t".join(poolNames)+"\n" #note: really cool method - -# print header -pooledCallsFile.write(header1) -pooledCallsFile.write(header2) -pooledCallsFile.write(header3) -pooledCallsFile.write(header4) -pooledCallsFile.write(header5) -pooledCallsFile.write(header6) - -# print rest of headers - -for i in range(len(poolNames)): - header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" - pooledOutputFiles[i].write(header1) - pooledOutputFiles[i].write(header2) - pooledOutputFiles[i].write(header3) - pooledOutputFiles[i].write(header4) - pooledOutputFiles[i].write(header5) - pooledOutputFiles[i].write(header6) - -for line in callFile: -# note: file is a .csv; so comma 
delimited with headers -# chr:position,alleles,gene,type,rs#,base_chg,annot,dist,pop_nr_freq,f_nr_case,f_nr_con,chisqstat_assoc,lrt_stat,f+,f-,SLOD,p_val,min_snp_dist,min_amp_dist,num_sig_pools,mean_worst_dir_nonref, ... -# ..., max_worst_dir_nonref,mean_diff_dir_nonref,max_diff_dir_nonref,mean_other/minor,max_other/minor,mean_frac_non_concord,max_frac_non_concord,mean_combined_lod,max_combined_lod,mean_cov_tot, ... -# ..., max_cov_tot,mean_cov_diff,max_cov_diff,mean_lod,max_lod,mean_lod_diff,max_lod_diff,mean_lod_diff_norm,max_lod_diff_norm,target_name,sig_pools - -# call file assumed already to have been split by pool - -# pileupFile is a .bam.coverage file from the pooled pipeline - -# call file isn't sorted by chr:position -# make the header strings - - if line.startswith("chr:pos"): - continue - elif not line.startswith("chr"): - continue - else: - #print(line) - entries = line.split(",") - chrompos = entries[0] - alleles = entries[1] - variant = alleles[1] - ref = alleles[0] - dbSNP = entries[4] - if dbSNP: - pass - else: - dbSNP="." 
- - supportingPools = entries.pop(62).rstrip(']').lstrip('[') - supportingPools = supportingPools.split(";") - total_quality = 0 - total_slod = 0 - total_depth = 0 - quality_by_pool = [] - slod_by_pool = [] - depth_by_pool = [] - #sys.exit() - for i in range(len(poolNames)): - #grab line from the correct dict - depth = 0; - slod = 0; - qual = 0; - try: - ceccLine = poolCombinedErrorCoverageCalls[i][chrompos] - depth = ceccLine[18] - qual=qualFromLod(float(ceccLine[21])) - if ceccLine[22] == "NA": - slod = 0; - else: - try: - slod = math.log10(float(ceccLine[23])) - except OverflowError: - slod = -1000; - except KeyError: - # do nothing - pass - if grep(poolInternalIDs[i],supportingPools): - #print this out to the file - chromsplit = chrompos.split(":") - outstr=chromsplit[0]+"\t"+chromsplit[1]+"\t"+dbSNP+"\t"+ref+"\t"+variant+"\t"+str(qual)+"\t0\t"+"DP="+str(depth)+";SB="+str(slod)+"\n" - pooledOutputFiles[i].write(outstr) - #now update data - total_slod = total_slod + float(slod) - total_depth = total_depth + int(depth) - total_quality = total_quality + qual - depth_by_pool.append(depth) - slod_by_pool.append(slod) - quality_by_pool.append(qual) - #now for the pooled file - chromsplit=chrompos.split(":") - outstr = chromsplit[0]+"\t"+chromsplit[1]+"\t"+dbSNP+"\t"+ref+"\t"+variant+"\t"+str(total_quality)+"\t0\t"+"DP="+str(total_depth)+";SB="+str(total_slod)+";NP="+str(len(supportingPools))+"\tGT:GQ:DP:SB" - pooledCallsFile.write(outstr) - #propagate individual pool information - for i in range(len(poolNames)): - phase = "0/0" - if grep(poolInternalIDs[i],supportingPools): - phase = "0/1" - else: - phase = "0/0" - pooledOut="\t"+phase+":"+str(quality_by_pool[i])+":"+str(depth_by_pool[i])+":"+str(slod_by_pool[i]) - pooledCallsFile.write(pooledOut) - pooledCallsFile.write("\n") - -## close all files ## - -pooledCallsFile.close() - -for i in range(len(poolNames)): - pooledOutputFiles[i].close() - diff --git a/python/faiReader.py b/python/faiReader.py deleted file mode 
100755 index 5e8a872a0..000000000 --- a/python/faiReader.py +++ /dev/null @@ -1,14 +0,0 @@ -from itertools import * - -def readFAI(file): -# 1 247249719 3 60 61 -# 2 242951149 251370554 60 61 -# 3 199501827 498370892 60 61 - return [line.split() for line in open(file)] - - -def readFAIContigOrdering(file): -# 1 247249719 3 60 61 -# 2 242951149 251370554 60 61 -# 3 199501827 498370892 60 61 - return dict([[rec[0], i] for rec, i in izip(readFAI(file), count())]) diff --git a/python/farm_commands.py b/python/farm_commands.py deleted file mode 100644 index 52fac51f5..000000000 --- a/python/farm_commands.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -import subprocess -import re - -#justPrintCommands = False - -def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_commands=False, outputFile = None, waitID = None, jobName = None, die_on_fail = False): - """if farm_queue != False, submits to queue, other -die_on_fail_msg: if != None, die on command failure (non-zero return) and show die_on_fail_msg""" - - if farm_queue: - if outputFile <> None: - farm_stdout = outputFile - elif output_head <> None: - farm_stdout = output_head+".stdout" - else: - #farm_stdout = None - farm_stdout = "%J.lsf.output" - - cmd_str = "bsub -q "+farm_queue - if farm_stdout <> None: - cmd_str += " -o " + farm_stdout - - if waitID <> None: - cmd_str += " -w \"ended(%s)\"" % (str(waitID)) - - if jobName <> None: - cmd_str += " -J %s" % (jobName) - - cmd_str += " '"+cmd_str_from_user + "'" - - print ">>> Farming via "+cmd_str - else: - cmd_str = cmd_str_from_user - print ">>> Executing "+cmd_str - - if just_print_commands or (globals().has_key("justPrintCommands") and globals().justPrintCommands): - return -1 - elif farm_queue: - result = subprocess.Popen([cmd_str, ""], shell=True, stdout=subprocess.PIPE).communicate()[0] - p = re.compile('Job <(\d+)> is submitted to queue') - jobid = p.match(result).group(1) - return jobid - else: - # Actually 
execute the command if we're not just in debugging output mode - status = os.system(cmd_str) - if not farm_queue: - print "<<< Exit code:", status,"\n" - if die_on_fail != None and status != 0: - print "### Failed with exit code "+str(status)+" while executing command "+cmd_str_from_user - sys.exit() - return int(status) diff --git a/python/farm_commands2.py b/python/farm_commands2.py deleted file mode 100755 index 4da53adb3..000000000 --- a/python/farm_commands2.py +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -import subprocess -import re -import unittest -import tempfile - -def all(iterable): - for element in iterable: - if not element: - return False - return True - -# maximum number of unnamed jobs allowed sa dependencies -MAX_UNNAMED_DEPENDENCIES = 10 - -class FarmJob: - def __init__( self, cmd_str_from_user, jobName = None, outputHead = None, outputFile = None, dependencies = [], dependencyNameString = None, dieOnFail = False, memlimit = None): - self.cmd_str_from_user = cmd_str_from_user - self.jobName = jobName - self.outputHead = outputHead - self.outputFile = outputFile - self.dieOnFail = dieOnFail - self.memlimit = memlimit - - self.dependencies = dependencies - if self.dependencies == None: - self.dependencies = [] - elif type(self.dependencies) != list: - self.dependencies = [self.dependencies] - self.dependencyNameString = dependencyNameString - - if len(self.dependencies) > MAX_UNNAMED_DEPENDENCIES: - depNames = map(FarmJob.getJobName, self.dependencies) - if len(filter(None, depNames)) > 1 and len(self.dependencies) != len(filter(None, depNames)): - # there are some unnamed and some named deps - raise Exception("Bad job names -- some are named and some are unnamed", depName) - - self.jobID = None # None indicates currently unscheduled - self.executionString = None # currently unscheduled - self.executed = False - self.jobStatus = None - - def getJobName(self): - return self.jobName - - def getJobIDString(self): - 
if self.jobName == None: - if self.jobID == None: - return "UNNAMED" - else: - return str(self.jobID) - else: - return self.getJobName() - - def __str__(self): - return "[JOB: name=%s id=%s depending on (%s) with cmd=%s]" % (self.getJobName(), self.jobID, ','.join(map(FarmJob.getJobIDString, self.dependencies)), self.cmd_str_from_user) - -# def __repr__(self): -# return self.__str__() - - -def longestCommonPrefix(strings): - #print 'LCP', strings - if strings == []: - return "" - else: - l = map(len, strings) - shortestLen = min(l) - #print '*arg', l, shortestLen - for i in range(shortestLen): - c = strings[0][i] - if not all(map(lambda s: c == s[i], strings)): - shortestLen = i - break - - return strings[0][0:shortestLen] - -def jobNameWildcard(jobs): - if len(jobs) == 1: - return jobs[0].getJobName() - else: - return longestCommonPrefix(map(FarmJob.getJobName, jobs)) + "*" - -def executeJobs(allJobs, farm_queue = None, just_print_commands = False, debug = True): - for job in allJobs: - if type(job) == list: - # convenience for lists of lists - map( lambda x: executeJob(x, farm_queue, just_print_commands, debug), job) - else: - print 'Preparing to execute', job - if not job.executed: - # schedule the dependents - executeJobs(job.dependencies, farm_queue, just_print_commands, debug = debug) - executeJob(job, farm_queue, just_print_commands, debug = debug) - print 'Executed', job - -justPrintJobIDCounter = 1 - -def executeJob(job, farm_queue = None, just_print_commands = False, debug = True, die_on_fail = True): - global justPrintJobIDCounter - job.executed = True - - # build my execution string - job.executionString = buildExecutionString(job, farm_queue, debug = debug) - - if just_print_commands or (globals().has_key("justPrintCommands") and globals().justPrintCommands): - job.jobID = justPrintJobIDCounter - justPrintJobIDCounter += 1 - elif farm_queue: - #print 'job.executionString', job.executionString - result = subprocess.Popen([job.executionString, ""], 
shell=True, stdout=subprocess.PIPE).communicate()[0] - p = re.compile('Job <(\d+)> is submitted to queue') - job.jobID = p.match(result).group(1) - else: - # Actually execute the command if we're not just in debugging output mode - status = os.system(job.executionString) - if not farm_queue: - print "<<< Exit code:", status,"\n" - if die_on_fail != None and status != 0: - print "### Failed with exit code "+str(status)+" while executing command "+job.cmd_str_from_user - sys.exit() - job.jobStatus = int(status) - -def buildExecutionString(job, farm_queue = None, debug = True): - if farm_queue != None: - if job.outputFile != None: - farm_stdout = job.outputFile - elif job.outputHead != None: - farm_stdout = job.outputHead + ".stdout" - else: - #farm_stdout = None - farm_stdout = "%J.lsf.output" - - cmd_str = "bsub -r -q " + farm_queue - if farm_stdout != None: - cmd_str += " -o " + farm_stdout - - if ( job.memlimit != None ): - cmd_str += " -R \"rusage[mem=" + job.memlimit[0:-1] + "]\"" - - # fixme - if job.dependencies != []: - cmd_str += buildJobDependencyString(job.dependencies) - if job.dependencyNameString != None: - cmd_str += buildJobDependencyString(job.dependencyNameString) - if job.jobName != None: - cmd_str += " -J %s" % job.jobName - - cmd_out, cmd_file = tempfile.mkstemp() - cmd_out = open(cmd_file, 'w') - cmd_out.write(job.cmd_str_from_user) - cmd_out.close() - - cmd_str += " < " + cmd_file + "" - #cmd_str += " '" + job.cmd_str_from_user + "'" - - if debug: print ">>> Farming via "+cmd_str - else: - cmd_str = job.cmd_str_from_user - if debug: print ">>> Executing "+cmd_str - - return cmd_str - -def allJobsAreNamed(jobs): - #print map(lambda x: x != None, map(FarmJob.getJobName, jobs)) - return all(map(lambda x: x != None, map(FarmJob.getJobName, jobs))) - -def buildJobDependencyString(depJobs): - if type(depJobs) == str: - # we are all formally named - depString = "ended(%s*)" % depJobs - elif allJobsAreNamed(depJobs): - # we are all formally named - 
depString = "ended(%s)" % jobNameWildcard(depJobs) - else: - depString = "&&".join(map(lambda x: "ended(%s)" % x.jobID, depJobs)) - - return " -w \"%s\"" % depString - -def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_commands=False, outputFile = None, waitID = None, jobName = None, die_on_fail = False): - """if farm_queue != False, submits to queue, other -die_on_fail_msg: if != None, die on command failure (non-zero return) and show die_on_fail_msg""" - - if farm_queue: - if outputFile <> None: - farm_stdout = outputFile - elif output_head <> None: - farm_stdout = output_head+".stdout" - else: - #farm_stdout = None - farm_stdout = "%J.lsf.output" - - cmd_str = "bsub -q "+farm_queue - if farm_stdout <> None: - cmd_str += " -o " + farm_stdout - - if waitID <> None: - cmd_str += " -w \"ended(%s)\"" % (str(waitID)) - - if jobName <> None: - cmd_str += " -J %s" % (jobName) - - cmd_str += " '"+cmd_str_from_user + "'" - - print ">>> Farming via "+cmd_str - else: - cmd_str = cmd_str_from_user - print ">>> Executing "+cmd_str - - if just_print_commands or (globals().has_key("justPrintCommands") and globals().justPrintCommands): - return -1 - elif farm_queue: - result = subprocess.Popen([cmd_str, ""], shell=True, stdout=subprocess.PIPE).communicate()[0] - p = re.compile('Job <(\d+)> is submitted to queue') - jobid = p.match(result).group(1) - return jobid - else: - # Actually execute the command if we're not just in debugging output mode - status = os.system(cmd_str) - if not farm_queue: - print "<<< Exit code:", status,"\n" - if die_on_fail != None and status != 0: - print "### Failed with exit code "+str(status)+" while executing command "+cmd_str_from_user - sys.exit() - return int(status) - - -# ------------------------------------------------------------------------------------------ -# -# Unit testing! 
-# -# ------------------------------------------------------------------------------------------ -class TestFarmCommands(unittest.TestCase): - def setUp(self): - print '' - print '-' * 100 - - def testMakingJob1(self): - print 'testMakingJob:' - job = FarmJob("foo") - executeJobs([job], just_print_commands = True) - - def testMakingJob2(self): - print 'testMakingJob2:' - job = FarmJob("bar", jobName = "barJobs", outputHead = "outputRoot", outputFile = "bar.log", dependencies = [], dieOnFail = True) - executeJobs([job], "gsa", just_print_commands = True) - - def testDepJobs1(self): - print 'testDepJobs1:' - job1 = FarmJob("job1") - job2 = FarmJob("job2") - job3 = FarmJob("job3", dependencies = [job1, job2]) - executeJobs([job1, job2, job3], "gsa", just_print_commands = True) - - def testDepJobs2(self): - print 'testDepJobs2:' - foojob1 = FarmJob("job1", jobName = "foojob1") - foojob2 = FarmJob("job2", jobName = "foojob2") - barjob1 = FarmJob("barjob1", jobName = "barjob1", dependencies = [foojob1]) - barjob2 = FarmJob("barjob2", jobName = "barjob2", dependencies = [foojob1, foojob2]) - bazjob1 = FarmJob("baz1", jobName = "baz1", dependencies = [barjob1, barjob2]) - executeJobs([bazjob1], "gsa", just_print_commands = True) - -if __name__ == '__main__': - unittest.main() - diff --git a/python/fasta.py b/python/fasta.py deleted file mode 100755 index 231c94dff..000000000 --- a/python/fasta.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python - -import string - -class fasta_record: - "Record containing one FASTA sequence" - def __init__(self, id, seq): - self.id = id - self.seq = seq - - def __str__(self): - return '['+self.id+" "+self.seq+']' - -class fasta_file: - "Iterable object based on FASTA file format" - def __init__(self, filename, cleanup=True): - "cleanup removes spaces from fasta text (default: True)" - self.filename = filename - self.fin = open(self.filename) - self._cleanup = cleanup - - def RecordGenerator(self): - line = self.fin.readline().rstrip() 
- assert line[0] == ">" - id = line[1:] - seq = "" - for line in self.fin: - line = line.rstrip() - if line[0] == ">": - yield fasta_record(id, seq) - id = line[1:] - seq = "" - else: - if self._cleanup: - seq += line.replace(" ","") - else: - seq += line - - yield fasta_record(id, seq) # Yield last seq - raise StopIteration # No more lines - - def __iter__(self): - return self.RecordGenerator() - - -if __name__ == "__main__": - print "Testing fast.py on file 5seqs.fa..." - for fasta_rec in fasta_file("5seqs.fa"): - print fasta_rec - diff --git a/python/firehose_out_email.py b/python/firehose_out_email.py deleted file mode 100755 index 3e06420fc..000000000 --- a/python/firehose_out_email.py +++ /dev/null @@ -1,63 +0,0 @@ -#this script produces the output to go in emails -import subprocess -import os -import re -import sys -import getopt -import sample_lister - -try: - opts, args = getopt.getopt(sys.argv[1:], "dp:s:") -except getopt.GetoptError, err: - # print help information and exit: - print str(err) # will print something like "option -a not recognized" - usage() - sys.exit(2) - -opts=dict(opts) -givenname=opts['-s'] -projname=opts['-p'] -if '-d' in opts: - dirname=opts['-d'] -else: - dirname="" - -class SampSet(sample_lister.SampleSet): - def __init__(self, projectname, sampset, pathname): - self.sampset=sampset - self.pathname=pathname - self.projectname=projectname - sample_lister.SampleSet.__init__(self, projectname, sampset, pathname) - def evalout(self): - '''This produced the output that needs to go in the emails''' - filename = "/humgen/gsa-firehose/firehose/firehose_output/trunk/Sample_Set/" + self.sampset +"/UnifiedGenotyper/"+ self.sampset+".filtered.eval" - evalfile = open(filename, "r").read() - annotations= ["all", "novel", "known", "snp_at_known_non_snps", "filtered"] - variant=dict(zip(annotations, ('','','','',''))) - ratio=dict(zip(annotations, ('','','','',''))) - bpre=re.compile("all,summary,variant_counts +n bases covered +(\d+)") - 
size=repr(bpre.search(evalfile).group(1)) - bamsearch="find /humgen/gsa-firehose/firehose/firehose_output/trunk/Sample_Set/"+self.sampset+"/* -name \*.bam | grep -v unfixed" - bams = subprocess.Popen([bamsearch], shell=True, stdout=subprocess.PIPE).communicate()[0] - sampno=bams.count("bam") - bedsearch="find /humgen/gsa-firehose/firehose/firehose_output/trunk/Sample_Set/"+self.sampset+"/* -name \*filtered_indels.bed" - beds = subprocess.Popen([bedsearch], shell=True, stdout=subprocess.PIPE).communicate()[0] - vcf="/humgen/gsa-firehose/firehose/firehose_output/trunk/Sample_Set/"+self.sampset+"/UnifiedGenotyper/"+self.sampset+'.maf.annotated.vcf' - for a in annotations: - anregexv = re.compile(a + ",summary,variant_counts +variants +(\d+)") - variant[a] = repr(anregexv.search(evalfile).group(1)) - anregexr = re.compile(a + ",summary,transitions_transversions +ratio +(\d+.\d+|Infinity)") - ratio[a] = repr(anregexr.search(evalfile).group(1)) - out1="Samples processed:"+repr(sampno)+"\n\n Target size: \t" +size+" bp \n\n\t\t\t\t\t Variants \t\t Ti/TV \n (true positives)\t All \t\t " +variant["all"]+ " \t\t " + ratio["all"] +" \n \t\t\t Known \t\t " +variant["known"]+ " \t\t " + ratio['known']+" \n \t\t\t Novel \t\t " +variant["novel"]+" \t\t " + ratio['novel']+ " \n*************************************************************************\n (false \tSNPS at known indels \t " +variant["snp_at_known_non_snps"]+"\t\t\t " + ratio['snp_at_known_non_snps']+ " \n positives) \t\t filtered \t " +variant["filtered"]+" \t\t " + ratio['filtered'] - out2="\n\n\nSNP calls:"+vcf+"\n\nIndel-realigned Bam files:\n"+bams+"\nIndel calls:\n"+beds - if self.pathname == '': - print(out1+out2) - else: - filename=self.pathname+self.sampset+".emailtxt" - putthere=open(filename, "w") - putthere.write(out1+out2) - -target=SampSet(projname,givenname,dirname) -target.evalout() -#TODO: make this send the email when run -#TODO: make this find the list of bams, bed files, and annotated vcfs. 
diff --git a/python/fixSoapSnp.py b/python/fixSoapSnp.py deleted file mode 100644 index dbd0b545c..000000000 --- a/python/fixSoapSnp.py +++ /dev/null @@ -1,23 +0,0 @@ -import os.path -import sys -from optparse import OptionParser - - -def main(): - global OPTIONS - usage = "usage: %prog [options] cmap input.soapsnp" - parser = OptionParser(usage=usage) - - (OPTIONS, args) = parser.parse_args() - if len(args) != 2: - parser.error("Requires exactly 2 arguments") - - cmap = dict([reversed(line.split()) for line in open(args[0])]) - #print cmap - for line in open(args[1]): - parts = line.split() - mapped = cmap[parts[0]] - print "\t".join([mapped] + parts[1:]) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/python/gatherIndelsToVCF.py b/python/gatherIndelsToVCF.py deleted file mode 100755 index c69a05701..000000000 --- a/python/gatherIndelsToVCF.py +++ /dev/null @@ -1,231 +0,0 @@ -#!/usr/bin/env python - -import os -import sys - -input_verbose_beds = sys.argv[1].split(",") -output_vcf_name = sys.argv[2] - -vcf_header = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] -format = "GT:FT" -class Indel: - def __init__(self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding, - consF,consR,refF,refR,consMM,refMM): - self.chrom = chrom - self.start = start - self.stop = stop - self.bases = bases - self.isDeletion = isDeletion - self.isFiltered = isFiltered - self.isHet = isHet - self.sample = sample - self.isCoding = isCoding - self.consForward = int(consF) - self.consReverse = int(consR) - self.refForward = int(refF) - self.refReverse = int(refR) - self.consMM = float(consMM) - self.refMM = float(refMM) - - def getAnnotations__DO_NOT_USE(self): - ## note: log(1/2) ~~ -0.3 - forwardLod = -0.3*(self.consForward+self.refForward) + 2*self.consForward - reverseLod = -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse - totalLod = -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 
2*(self.consForward+self.consReverse) - strand_score = max(forwardLod-totalLod,reverseLod-totalLod) - SB = "SB="+str(strand_score) - mismatch = "MMR="+str(self.consMM) - ref_mismatch = "refMMR="+str(self.refMM) - return [SB,mismatch,ref_mismatch] - - def getTotalLod(self): - return -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 2*(self.consForward+self.consReverse) - - def getFwdLod(self): - return -0.3*(self.consForward+self.refForward) + 2*self.consForward - - def getRevLod(self): - return -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse - -def getAnnotations(indelList): - total_lod = 0.0 - total_fwd_lod = 0.0 - total_rev_lod = 0.0 - avg_mmr = 0.0 - avg_ref_mmr = 0.0 - for indel in indelList: - total_lod += indel.getTotalLod() - total_fwd_lod += indel.getFwdLod() - total_rev_lod += indel.getRevLod() - avg_mmr += indel.consMM - avg_ref_mmr += indel.refMM - avg_mmr = avg_mmr/len(indelList) - avg_ref_mmr = avg_ref_mmr/len(indelList) - strand_score = max(total_fwd_lod-total_lod,total_rev_lod-total_lod) - SB = "SB="+str(strand_score) - mismatch = "MMR="+str(avg_mmr) - ref_mismatch = "refMMR="+str(avg_ref_mmr) - return [SB,mismatch,ref_mismatch] - -def getGenotypes(samples,indels,alts): - s2i = dict() - alts = alts.split(",") - genotypes = list() - for indel in indels: - s2i[indel.sample] = indel - for sample in samples: - if ( sample in s2i.keys() ): - if ( s2i[sample].isDeletion ): - bases = "D"+str(s2i[sample].bases) - else: - bases = "I"+s2i[sample].bases - gt_index = str(1+alts.index(bases)) - if ( s2i[sample].isHet ): - gtstr = "0/"+gt_index - else: - gtstr = gt_index+"/"+gt_index - if ( s2i[sample].isFiltered ): - gtstr += ":1" - else: - gtstr += ":0" - else: - gtstr = "0/0:0" - genotypes.append(gtstr) - return genotypes - -def getAlts(indel_list): - alts = list() - for indel in indel_list: - if ( indel.isDeletion ): - if ( not "D"+indel.bases in alts ): - alts.append("D"+indel.bases) - else: - if ( not "I"+indel.bases in 
alts ): - alts.append("I"+indel.bases) - - alt = ",".join(alts) - - return alt - -def fixAlts(alts_in): - alts = alts_in.split(",") - fixed_alts = list() - for alt in alts: - if ( alt.startswith("D") ): - fixed_alts.append("D"+str(len(alt)-1)) - else: - fixed_alts.append(alt) - return ",".join(fixed_alts) - -def fixChrom(chrom): - if ( chrom == 0 ): - return "chrM") - if ( chrom == "23" ): - return "chrX" - if ( chrom == "24" ): - return "chrY" - return "chr"+chrom - -def writeVCFLine(out_stream,indel_list,sample_list): - alts = getAlts(indel_list) - ID = "." ## ignore dbsnp annotation for now - chrom = str(indel_list[0].chrom) - start = str(indel_list[0].start) - if ( indel_list[0].isCoding ): - info = "type=Codon;" - else: - info = "type=Intron;" - info += "count="+str(len(indel_list)) - # format is global - def inSampleOrdering(indel1,indel2): - return sample_list.index(indel1.sample)-sample_list.index(indel2.sample) - indel_list.sort(inSampleOrdering) - fixed_alts = fixAlts(alts) - if ( True or not fixed_alts.find(",") > -1 ): - info += ";"+";".join(getAnnotations(indel_list)) - entries = [fixChrom(chrom),start,ID,"A",fixed_alts,"50","0",info,format] - for e in getGenotypes(sample_list,indel_list,alts): - entries.append(e) - out_stream.write("\t".join(entries)+"\n") - -def startCompare(ind1,ind2): - if ( ind1.chrom != ind2.chrom ): - return ind1.chrom - ind2.chrom - else: - return ind1.start - ind2.start - -def output(indels,popname,samples): - outfile = open(output_vcf_name+"_BAD_REFERENCE.vcf",'w') - outfile.write("##VCFv3.2\n##Created by gatherIndelsToVCF.py\n") - outfile.write("\t".join(vcf_header)+"\t"+"\t".join(samples)+"\n") - indels_for_line = list() - for indel in indels: - if ( len(indels_for_line) == 0 or startCompare(indel,indels_for_line[len(indels_for_line)-1]) == 0 ): - indels_for_line.append(indel) - else: - writeVCFLine(outfile,indels_for_line,samples) - indels_for_line = list() - outfile.close() - -def parseSample(filename): - return 
filename.split(".",1)[0] - -def parseIndels(filePath,sampleName): - f = open(filePath) - indels = list() - for line in f.readlines(): - if ( not line.startswith("[") ): - spline = line.split("\t") - spline[0] = spline[0].split("chr")[1] - if ( spline[0] != "X" and spline[0] != "Y" and spline[0] != "M"): - chrom = int(spline[0]) - else: - if ( spline[0] == "M" ): - chrom = 0 - elif ( spline[0] == "X"): - chrom = 23 - else: - chrom = 24 - start = int(spline[1]) - stop = int(spline[2]) - rawbase = spline[3] - isDeletion = rawbase.startswith("-") - if ( isDeletion ): - bases = spline[3].split("-")[1] - else: - bases = spline[3].split("+")[1] - sample = sampleName - isFiltered = False - if ( line.find("AUTOFILTER") > -1 ): - if ( line.find("CONS_AV_MM") > -1 ): - mm = float(line.split("AV_MM[C/R]:")[1].split("/")[0]) - if ( mm >= 3.5 ): - isFiltered = True - else: - isFiltered = True - isCoding = line.find("CODING") > -1 - isHet = True ## haven't seen a hom yet---todo --- fix this - # STRAND_COUNTS[C/C/R/R]: - strand_counts_field = line.split("STRAND_COUNTS[C/C/R/R]:")[1].split()[0] - strand_counts = strand_counts_field.split("/") - consF = strand_counts[0] - consR = strand_counts[1] - refF = strand_counts[2] - refR = strand_counts[3] - # AV_MM[C/R]: - mismatch_field = line.split("AV_MM[C/R]:")[1].split()[0] - consMM = mismatch_field.split("/")[0] - refMM = mismatch_field.split("/")[1] - # recall order: self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding - indels.append(Indel(chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding,consF,consR,refF,refR,consMM,refMM)) - return indels - -def writeVCF(verbose_bed_list): - for file in verbose_bed_list: - samples.append(parseSample(file)) - indels.extend(parseIndels(indel_dir+file,parseSample(file))) - indels.sort(startCompare) - output(indels,popname,samples) - -writeVCF(input_verbose_beds) - diff --git a/python/gatherSampleSummaries.py b/python/gatherSampleSummaries.py deleted file mode 
100644 index 371309340..000000000 --- a/python/gatherSampleSummaries.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -base1 = "/humgen/gsa-hphome1/chartl/projects/exome/coverage/whole_exome_broad/broad/" -base2 = "/humgen/gsa-hphome1/chartl/projects/exome/coverage/whole_exome_broad/intersect/" - -def getField(fname,header): - f = open(fname) - h = f.readline().split("\t") - idx = h.index(header) - return map(lambda x: x.strip().split("\t")[idx],filter(lambda u: not u.startswith("Tot"),f.readlines())) - -def getFiles(base): - num_jobs = len(os.listdir(base+"GATKDispatcher/")) - sum_file = base+"GATKDispatcher/dispatch%d/job%d.sample_summary" - int_file = base+"GATKDispatcher/dispatch%d/job%d.sample_interval_summary" - files = map( lambda x: (sum_file % (x,x), int_file % (x,x)), range(num_jobs)) - return files - -def getBases(base): - files = getFiles(base) - lines = map(lambda x: getField(x[1],"Target"),files) - sizes = map( lambda iList: reduce ( lambda u,v: u + v , map(lambda ival: int(ival.split(":")[1].split("-")[1])-int(ival.split(":")[1].split("-")[0]), iList ) ) , lines ) - return(sizes) - -def getPctAbove(base): - files = getFiles(base) - pct_above = map(lambda y: map(lambda z: float(z),y),map(lambda x: getField(x[0],"%_bases_above_20"),files)) - return pct_above - -def getTotals(base): - files = getFiles(base) - totals = map(lambda y: map(lambda z: int(z),y),map(lambda x: getField(x[0],"total"),files)) - return totals - -bases = getBases(base1) + getBases(base2) -totals = getTotals(base1) + getTotals(base2) -pct_above = getPctAbove(base1) + getPctAbove(base2) -files = getFiles(base1) + getFiles(base2) - -total_bases = reduce(lambda u,v: u + v , bases) - -total_cvg = map(lambda u: 0, range(len(pct_above[0]))) -total_covered_20 = map(lambda u: 0, range(len(pct_above[0]))) -for f_idx in range(len(files)): - for sam_idx in range(len(pct_above[0])): - total_covered_20[sam_idx] += pct_above[f_idx][sam_idx]*bases[f_idx] - total_cvg[sam_idx] += totals[f_idx][sam_idx] 
- -pct_above_20 = map( lambda x: x/total_bases, total_covered_20) -mean_cvgs = map( lambda x: (x+0.0)/total_bases, total_cvg) -#print(total_bases) -#print(total_cvg) -print("Mean\t%_above_20") -mean_str = map(lambda u: "%.1f" % u, mean_cvgs) -above_str = map(lambda u: "%.1f" % u, pct_above_20) -print(reduce( lambda u,v: u + "\n" + v, map(lambda idx: mean_str[idx]+"\t"+above_str[idx],range(len(mean_str))))) diff --git a/python/gatkConfigParser.py b/python/gatkConfigParser.py deleted file mode 100755 index ee3514635..000000000 --- a/python/gatkConfigParser.py +++ /dev/null @@ -1,104 +0,0 @@ -# -# GATK configuration parser -# -import ConfigParser -import os.path -import sys - -defaultRequiredOptions = {} -def addRequiredOption(name, type): - defaultRequiredOptions[name] = type - -addRequiredOption('jar', 'input_file') -addRequiredOption('reference', 'input_file') -addRequiredOption('referenceIndex', 'input_file') -addRequiredOption('referenceDict', 'input_file') -addRequiredOption('java', 'file') -addRequiredOption('jvm_args', str) -addRequiredOption('args', str) -addRequiredOption('tmp', 'output_file') - -class gatkConfigParser(ConfigParser.SafeConfigParser): - GATK = 'DEFAULT' - - def __init__(self, configFiles): - ConfigParser.SafeConfigParser.__init__(self) - files = filter(None, configFiles) - print 'Reading configuration file(s):', files - self.read(files) - self.validateRequiredOptions() - self.moreArgs = "" - - def validateRequiredOptions(self): - for key, value in defaultRequiredOptions.iteritems(): - self.validateOption(self.GATK, key, value) - - def validateOption(self, section, name, type = str): - v = self.getOption(section, name, type) - #print ' => Validated option', name, v - - def getGATKOption(self, name, type = str): - return self.getOption(self.GATK, name, type) - - def getGATKModeOption(self, name, mode, type = str): - return self.getOption(mode, name, type) - - def getOption(self, section, name, typeF = None): - if not self.has_option(section, 
name): - raise "Option %s not found in section %s" % (name, section) - else: - val = self.get(section, name) - if typeF == 'input_file' or typeF == 'output_file': - path = os.path.abspath(os.path.expanduser(val)) - if typeF == 'input_file': - if not os.path.exists(path): - raise "Input file does not exist", path - if not os.access(path, os.R_OK): - raise "Input file cannot be read", path - if typeF == 'output_file': - if not os.access(path, os.W_OK): - raise "Output file cannot be written", path - return path - elif type(typeF) == str: - return str(val) - elif typeF == None: - return val - else: - return typeF(val) - - def java(self): return self.getOption(self.GATK, 'java') - def jvm_args(self): return self.getOption(self.GATK, 'jvm_args') - def jar(self): return self.getOption(self.GATK, 'jar') - def gatk_args(self): return self.getOption(self.GATK, 'args') - def reference(self): return self.getOption(self.GATK, 'reference') - - def setMoreArgs(self, s): - self.moreArgs = s - - def gatkCmd(self, mode, log = None, stdLogName=False): - cmd = ' '.join([self.java(), self.jvm_args(), '-jar', self.jar(), self.gatk_args(), '-R', self.reference()]) - cmd += ' ' + ' '.join(['-T', mode, self.getGATKModeOption('args', mode)]) + ' ' + self.moreArgs - if log <> None: - if stdLogName: - #head, ext = os.path.splitext(log) - logName = log + "." 
+ mode + ".log" - else: - logName = log - cmd += ' ' + ' '.join(['-log', logName]) - return cmd - -import unittest -class TestMergeBAMsUtils(unittest.TestCase): - def setUp(self): - configFile = os.path.join(os.path.split(sys.argv[0])[0] + "/../testdata/defaultGATKConfig.cfg") - self.config = gatkConfigParser(configFile) - - def testValidate(self): - self.config.validateRequiredOptions() - - #def testCmd(self): - # s = "java -ea -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -L 1:1-10,000,000 -R /home/radon01/depristo/work/humanref/Homo_sapiens_assembly18.fasta -T CountCovariates --MIN_MAPPING_QUALITY 1" - # self.assertEquals(self.config.gatkCmd('CountCovariates'), s) - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/python/generate1KGHapmapVCF.py b/python/generate1KGHapmapVCF.py deleted file mode 100755 index 44b58c59b..000000000 --- a/python/generate1KGHapmapVCF.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python - -import os - -hapmap_dir = os.getcwd()+"/" ##CHANGE ME - -def convert(line): - line = line.replace("chr","",1) - if ( line.startswith("M") ): - line = line.replace("M","MT",1) - return line - -for file in os.listdir(hapmap_dir): - if ( file.endswith('vcf') ): - chrM_lines = list() - print("converting: "+file) - in_vcf = open(hapmap_dir+file) - out_vcf_filename = file.replace("hg18","b36") - out_vcf = open(out_vcf_filename,'w') - for line in in_vcf.readlines(): - if ( line.startswith("#") ): - out_vcf.write(line) - else: - if ( line.startswith("chrM") ): - chrM_lines.append(line) - else: - out_vcf.write(convert(line)) - for line in chrM_lines: - out_vcf.write(convert(line)) diff --git a/python/generate_per_sample_metrics.py b/python/generate_per_sample_metrics.py deleted file mode 100644 index 4092f711e..000000000 --- a/python/generate_per_sample_metrics.py +++ /dev/null @@ -1,106 +0,0 @@ -# -# Reads in selected Picard metrics, generating an R-compatible TSV suitable for pre-QC 
analysis. -# -# To run: -# /humgen/gsa-hpprojects/software/bin/jython2.5.2/jython \ -# -J-classpath $STING_HOME/dist/sam-1.47.869.jar:$STING_HOME/dist/picard-1.47.869.jar:$STING_HOME/dist/picard-private-parts-1941.jar \ -# $STING_HOME/python/generate_per_sample_metrics.py > -# -# To add a new metric: -# - If the metric file is new to Picard, add the relevant parser to the picard-private jar -# (see http://www.broadinstitute.org/gsa/wiki/index.php/Adding_and_updating_dependencies for details). -# - Add the field name to the header array. -# - Add the field data to the statement printing the data array. -# -from java.lang import * -from java.io import File,FileReader - -from edu.mit.broad.picard.genotype.concordance import DbSnpMatchMetrics -from net.sf.picard.analysis import AlignmentSummaryMetrics,InsertSizeMetrics -from net.sf.picard.analysis.directed import HsMetrics -from net.sf.picard.metrics import MetricsFile - -import os,string,sys - -def median(l): - return sorted(l)[(len(l)+1)/2] -def mean(l): - return float(sum(l))/len(l) - -def get_all_metrics(filename): - if not os.path.exists(filename): - return None - file_reader = FileReader(filename) - metrics_file = MetricsFile() - metrics_file.read(file_reader) - metrics = metrics_file.getMetrics() - file_reader.close() - return metrics - -def get_sample_summary_metrics_fields(type): - return [field.getName() for field in type.getFields() if not field.getName().startswith('__')] - -def get_sample_summary_metrics(filename): - if not os.path.exists(filename): - return None - file_reader = FileReader(filename) - metrics_file = MetricsFile() - metrics_file.read(file_reader) - metrics = metrics_file.getMetrics()[0] - file_reader.close() - return metrics - -if len(sys.argv) != 2: - print 'USAGE: %s ' - sys.exit(1) -if not os.path.exists(sys.argv[1]): - print 'BAM list %s not found' % sys.argv[1] - sys.exit(1) - -bam_list_filename = sys.argv[1] - -sample_summary_metrics_types = [ (HsMetrics,'hybrid_selection_metrics'), - 
(AlignmentSummaryMetrics, 'alignment_summary_metrics'), - (InsertSizeMetrics, 'insert_size_metrics'), - (DbSnpMatchMetrics, 'dbsnp_matches') ] - -header = ['sample','FINGERPRINT_LODS','HAPLOTYPES_CONFIDENTLY_MATCHING'] -for metric_type in sample_summary_metrics_types: - header.extend(get_sample_summary_metrics_fields(metric_type[0])) -print string.join(header,'\t') - -# get a representative BAM file for each sample, to use as a base path. Note that this assumes every sample corresponds to the same base path. -bam_list = open(bam_list_filename,'r') -samples = dict() - -for bam_filename in bam_list: - bam_filename = bam_filename.strip() - if bam_filename == '': - continue - bam_filename_tokens = bam_filename.split('/') - sample_id = bam_filename_tokens[len(bam_filename_tokens)-3] - samples[sample_id] = bam_filename -bam_list.close() - -for sample_id,filename in samples.items(): - basepath = filename[:filename.rindex('.bam')] - - fingerprinting_summary_metrics = get_all_metrics('%s.%s' % (basepath,'fingerprinting_summary_metrics')) - - if fingerprinting_summary_metrics != None: - haplotypes_confidently_matching = [str(metric.HAPLOTYPES_CONFIDENTLY_MATCHING) for metric in fingerprinting_summary_metrics] - fingerprint_lods = [str(metric.LOD_EXPECTED_SAMPLE) for metric in fingerprinting_summary_metrics] - else: - haplotypes_confidently_matching = [] - fingerprint_lods = [] - - data = [sample_id,'c('+string.join(fingerprint_lods,',')+')','c('+string.join(haplotypes_confidently_matching,',')+')'] - - for metrics_type,metrics_extension in sample_summary_metrics_types: - metrics_pathname = '%s.%s' % (basepath,metrics_extension) - if os.path.exists(metrics_pathname): - metrics = get_sample_summary_metrics(metrics_pathname) - data.extend([getattr(metrics, metrics_field_name) for metrics_field_name in get_sample_summary_metrics_fields(metrics_type)]) - else: - data.extend(['NA' for metrics_field_name in get_sample_summary_metrics_fields(metrics_type)]) - print 
string.join(['%s']*len(header),'\t')%tuple(data) diff --git a/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py b/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py deleted file mode 100755 index a4f8dc246..000000000 --- a/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py +++ /dev/null @@ -1,85 +0,0 @@ -import sys -import os -import re -import traceback -from optparse import OptionParser, OptionGroup -from IndentedHelpFormatterWithNL import * - - -# Init cmd-line args -description = """ -This script runs a command that concatenates all 50 per-chromosome files generated by the GenerateTranscriptToInfo.py script into one big file that can be directly used by the GenomicAnnotator. -""" - -parser = OptionParser( description=description, usage="usage: %prog [options] ", formatter=IndentedHelpFormatterWithNL()) - -parser.add_option("-d", "--directory", metavar="DIR", dest="directory", help="Specifies the directory that contains the files to concatenate (eg. /humgen/gsa-hpprojects/GATK/data/Annotations/refseq/raw/)") -parser.add_option("-f", "--filename-prefix", metavar="PREFIX", dest="prefix", help="Filename prefix (eg. refGene)") -parser.add_option("-u", "--ucsc", dest="ucsc", action="store_true", default=False, help="Generate the output file for use with the NCBI reference genome (this effects chromosome order and naming (eg. M chromosome is first and its called 'chrM' instead of 'MT')).") -parser.add_option("-n", "--ncbi", dest="ncbi", action="store_true", default=False, help="Generate the output file for use with the UCSC reference genome (this effects chromosome order and naming (eg. MT chromosome is last and its called 'MT' instead of 'chrM')).") - -(options, args) = parser.parse_args() - - -def error(msg): - print("ERROR: %s. 
(Rerun with -h to print help info) \n" % msg) - parser.print_help() - sys.exit(-1) - -ucsc = options.ucsc -ncbi = options.ncbi - -if not ucsc and not ncbi: - error("Must run with either -u or -n") - -directory = options.directory -if not directory: - error("Must specify the directory using -d") - -prefix = options.prefix -if not prefix: - error("Must specify filename prefix using -f") - -contig_chars = [] -if ucsc: - contig_chars = ["M"] + range(1,23) + ["X", "Y"] -else: - contig_chars = range(1,23) + ["X", "Y", "M"] - - -contigs = [] -contigs += [ "chr" + str(x) for x in contig_chars ] - -if ucsc: # NCBI doesn't have the _random contigs - contigs += [ "chr" + str(x) + "_random" for x in set( contig_chars ).difference(set(['M','MT',12,14,20,'X','Y'])) ] # There's no _random chromosomes for chrM,12,14,20,Y - -#print(contigs) - -dir_plus_prefix = os.path.join(directory,prefix) - -# Update the *-big-table-header.txt header file using the header from one of the single-contig files - in case TranscriptToInfo was changed with columns being added or removed. 
-command = "head -n 1 " + dir_plus_prefix + ("-big-table-ucsc-%s.txt " % contigs[0]) + " > " + dir_plus_prefix + "-big-table-header.txt" -print(command) -os.system(command) - - -# Concatenate -header_start = open(dir_plus_prefix + "-big-table-header.txt").read().split("\t")[0] -command = "cat " -for contig in contigs: - command += dir_plus_prefix + "-big-table-ucsc-%s.txt " % contig - -command += " | grep -v " + header_start -if ncbi: - command += "| perl -pe 's/^chrM(.*)$/MT\\1/i' | perl -pe 's/^chr([^p].*)$/\\1/i' " # rename chrM to MT and remove the 'chr' from chromosome names - -command += " | cat " + dir_plus_prefix + "-big-table-header.txt - " - -if ucsc: - command += " > " + dir_plus_prefix + "-big-table-ucsc.txt" -else: - command += " > " + dir_plus_prefix + "-big-table-ncbi.txt" - -print(command) -os.system(command) - diff --git a/python/genomicAnnotatorScripts/ConvertTableToAnnotatorRod.py b/python/genomicAnnotatorScripts/ConvertTableToAnnotatorRod.py deleted file mode 100755 index 2e012b224..000000000 --- a/python/genomicAnnotatorScripts/ConvertTableToAnnotatorRod.py +++ /dev/null @@ -1,514 +0,0 @@ -import sys -import os -import re -import traceback -from optparse import OptionParser, OptionGroup -from IndentedHelpFormatterWithNL import * - -# Init cmd-line args -description = """ -This script takes a text-based tabular INPUT-FILE, validates it, and converts it into the format expected by the GenomicAnnotator. -More details can found here: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator#Tabular_Data_Format -""" -parser = OptionParser( description=description, usage="usage: %prog [options] INPUT-FILE", formatter=IndentedHelpFormatterWithNL()) -parser.add_option("-l", "--location-columns", metavar="COLUMNS", help="""The (1-based) column number(s) of the columns in INPUT-FILE that contain coordinates. \n -For example, '-l 2,3' means column #2 and column #3 contain coordinate info. 
COLUMNS can be set to one, two, or three comma-separated numbers:\n - 1 number means column1 is of the form 'choromosome:position' or 'chromosome:start-stop'\n - 2 numbers means column1 = choromosome, column2 = position.\n - 3 numbers means column1 = choromosome, column2 = start position, column3 = stop position.""") - -parser.add_option("-c", "--input-coords", dest="coordinates", metavar="COORD-TYPE", help="""Specifies the coordinate system of INPUT-FILE's chromosome/position column(s). COORD-TYPE can be:\n - * DONT-CHANGE coordinates will be left as-is.\n - * ONE-BASED-INCLUSIVE 1-based inclusive. Behavior the same as for DONT-CHANGE.\n - * POSITIONAL same as ONE-BASED-INCLUSIVE.\n - * ADD-ONE-TO-START-COORDS all start-coordinates will be incremented by one. - * ZERO-BASED-HALF-OPEN 0-based half-open. Behavior the same as for ADD-ONE-TO-START-COORDS\n - * OFFSET same as ZERO-BASED-HALF-OPEN.\n - * ADD-ONE-TO-START-AND-END-COORDS all start and end coordinates will be incremented by one. - * ZERO-BASED-INCLSUIVE 0-based half-open. Behavior the same as for ADD-ONE-TO-START-COORDS\n - - -Note: This setting is used to convert all coordinates into 1-based half-open for the output.""") -parser.add_option("-t", "--output-style", dest="sequence_build", metavar="BUILD", help="Sets the output file's reference build type to either UCSC or NCBI. This should be set based on what reference file will be used when running the GenomicAnnotator. UCSC builds can be specified as either 'hgXX' (eg. hg18) or 'UCSC'. NCBI builds can be specified as 'bXX' (eg. b36) or 'NCBI'. The build type determines chromosome order and naming convention (eg. 'chr1' or '1').") -#parser.add_option("-i", "--include-columns", dest="include_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of all columns to include in the outptut file. 
Any columns not in this list will be discarded.") -#parser.add_option("-e", "--exclude-columns", dest="exclude_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of the columns to include in the outptut file. Any columns not in this list will be discarded.") - -group = OptionGroup(parser, "Optional Args", " ") - -group.add_option("-o", "--output-filename", help="Output file path [Default: %default]", default="stdout") -group.add_option("-r", "--haplotype-reference-column", metavar="COLUMN", dest="haplotype_reference_column", help="1-based column number of the column to use as haplotypeReference. Specifying this will rename the column to 'haplotypeReference' in the header.") -group.add_option("-a", "--haplotype-alternate-column", metavar="COLUMN", dest="haplotype_alternate_column", help="1-based column number of the column to use as haplotypeAlternate. Specifying this will rename the column to 'haplotypeAlternate' in the header.") -group.add_option("-s", "--haplotype-strand-column", metavar="COLUMN", dest="haplotype_strand_column", help="1-based column number of the haplotypeStrand. Specifying this will rename the column to 'haplotypeStrand' in the header.") -group.add_option("-k", "--keep-original-columns", action="store_true", default=False, dest="keep_copy", help="This flag makes it so that the columns passed to -l, -r, -a, and -s args are not removed when their contents is used to generate the special columns (eg. 'chrpos', 'haplotypeReference', etc..).") -group.add_option("-m", "--other-start-columns", metavar="COLUMNS", dest="other_start_columns", help="Comma-separated list of 1 or more column numbers (1-based) representing other columns that contain start coordinates and need to be converted from the coordinate system specified by -c. 
For example, the refGene table has coordinates for cdsStart which need to be converted along with the chromosome, txStart, and txEnd columns.") -group.add_option("-n", "--other-end-columns", metavar="COLUMNS", dest="other_end_columns", help="Comma-separated list of 1 or more column numbers (1-based) representing other columns that contain end coordinates and need to be converted from the coordinate system specified by -c") -group.add_option("-v", "--verbose", action="store_true", default=False, help="Verbose.") -group.add_option("-d", "--delimiter", help="The delimiter that separates values in a line of INPUT-FILE. Set to 'tab' to make it use tab [Default: spaces].") - - -parser.add_option_group(group) - -(options, args) = parser.parse_args() - - - -def error(msg): - print("ERROR: %s. (Rerun with -h to print help info) \n" % msg) - #parser.print_help() - sys.exit(-1) - -def warn(msg): - print("WARNING: %s" % msg) - -def fatal(msg): - print(msg) - sys.exit(-1) - - -def join_fields(fields): - return OUTPUT_FORMAT_DELIMITER.join(fields) - - -def split_line(line): - if delimiter: - return line.split(delimiter) - else: - return line.split() - -def line_key(line): - return chrpos_to_n( split_line(line) ) - - -# Computes an integer key for this line. These keys can be used to sort the lines by reference -def chrpos_to_n(lsplit): - # Get chr, pos from line - - chr_value, start_value, chr_prefix = None, None, '' # Init in case of error - try: - split1 = lsplit[0].split(":") # Get chr:start-stop out of the 1st column. - chr_value = split1[0].lower().strip() - split2 = split1[1].split("-") - start_value = split2[0].lower().strip() - start_n = long(start_value) - stop_n = start_n - if len(split2) > 1: - stop_value = split2[1].lower().strip() - stop_n = long(stop_value) - #Become chr_prefix aware - if chr_value.count("chr"): - chr_prefix = "chr" - else: - chr_prefix = "" - except: - sys.stderr.write("chrom: %s, start: %s. 
Couldn't parse line: %s \n" % (chr_value, start_value, line)) - raise - - # Covert them to N - a = 0 - if sequence_build == "UCSC" and chr_value.count("_random"): - chr_value = chr_value.replace("_random", "") - a = 30 # Offset so that "random" chromosomes go last - - if sequence_build == "UCSC": - chr_value = chr_value.replace(chr_prefix+"m", chr_prefix+"0") - else: - chr_value = chr_value.replace(chr_prefix+"m", chr_prefix+"25") - - chr_n = a + int(chr_value.replace(chr_prefix+"x", chr_prefix+"23").replace(chr_prefix+"y", chr_prefix+"24").replace(chr_prefix,"")) + 1 - - N = (chr_n * 10L**23) + (start_n * 10L**11) + stop_n # Combine chr, start, stop into a single numeric key for sorting - - #print("N: " + str(N) + " line: " + line) - return N - -def is_valid_chrpos(line): - try: - # Compute the line key - line_key(line) - return True - except Exception, e: - #print(str(e)) - return False - - -# Takes a string containing a list of numbers. Returns the same string with all numbers incremented by one. -def increment_by_one(comma_separated_coords): - converted_coords_string = "" - for coord in comma_separated_coords.split(","): - if coord.strip() == "": - continue - - if len(converted_coords_string) > 0: - converted_coords_string += "," - - converted_coords_string += str(long(coord) + 1) - - return converted_coords_string - - - - -#print(args) # List of positional args. 
-#print(options.output_filename) -#print(options.coordinates) # required -#print(options.location_columns) -#print(options.delimiter) -#print(options.verbose) - - -# Validate and process cmd-line args - -verbose = options.verbose - -if verbose: - print("%s v0.9" % sys.argv[0]) - -delimiter = options.delimiter -if delimiter and delimiter.lower() == "tab": - delimiter = "\t" - -if len(args) < 1 or not os.access(args[0], os.R_OK): - error("Requires a valid INPUT-FILE") -input_filename = args[0] - - -add_one_to_start_coords = False -add_one_to_end_coords = False -if options.coordinates == "DONT-CHANGE" or options.coordinates == "ONE-BASED-INCLUSIVE" or options.coordinates == "POSITIONAL": - pass -elif options.coordinates == "ADD-ONE-TO-START-COORDS" or options.coordinates == "ZERO-BASED-HALF-OPEN" or options.coordinates == "OFFSET": - add_one_to_start_coords = True -elif options.coordinates == "ADD-ONE-TO-START-AND-END-COORDS" or options.coordinates == "ZERO-BASED-INCLUSIVE": - add_one_to_start_coords = True - add_one_to_end_coords = True -else: - if not options.coordinates: - error("-c arg must be specified") - else: - error("Invalid -c value: %s" % str(options.coordinates)) - -if not options.location_columns: - error("-l arg must be specified") - -loc_columns = options.location_columns.split(",") -if len(loc_columns) < 1 or len(loc_columns) > 3: - error("-l COLUMNS must specify a comma-separated list of between 1 and 3 numbers.") - -#if verbose: -# print("Parsed -c: " + str(loc_columns)) - -try: - chr_column = int(loc_columns[0]) - 1 - start_column = None - stop_column = None - columns_to_be_moved_in_order = [chr_column] - if len(loc_columns) > 1: - start_column = long(loc_columns[1]) - 1 - columns_to_be_moved_in_order += [start_column] - - if len(loc_columns) > 2: - stop_column = long(loc_columns[2]) - 1 - columns_to_be_moved_in_order += [stop_column] - -except: - error("-l COLUMNS - all elements in the comma-separated list must be integers.") - -if (chr_column and 
chr_column < 0) or (start_column and start_column < 0) or (stop_column and stop_column < 0): - error("-l COLUMNS - all elements in the comma-separated list must be >= 1") - - -if not options.sequence_build: - error("-t arg must be specified") - -sequence_build = options.sequence_build.lower() -if sequence_build.startswith("b") or sequence_build == "ncbi": - sequence_build = "NCBI" -elif sequence_build.startswith("hg") or sequence_build == "ucsc": - sequence_build = "UCSC" -else: - error("-t arg must be one of these: 'hgXX' (eg. hg18), 'UCSC', 'bXX' (eg. b36), 'NCBI'") - - -haplotype_reference_column = None -if options.haplotype_reference_column: - try: - haplotype_reference_column = int(options.haplotype_reference_column) - 1 - columns_to_be_moved_in_order += [haplotype_reference_column] - except: - error("-r arg must be an integer") - - -haplotype_alternate_column = None -if options.haplotype_alternate_column: - try: - haplotype_alternate_column = int(options.haplotype_alternate_column) - 1 - columns_to_be_moved_in_order += [haplotype_alternate_column] - except: - error("-a arg must be an integer") - -haplotype_strand_column = None -if options.haplotype_strand_column: - try: - haplotype_strand_column = int(options.haplotype_strand_column) - 1 - columns_to_be_moved_in_order += [haplotype_strand_column] - except: - error("-s arg must be an integer") - -keep_copy = options.keep_copy - -output_filename = options.output_filename -if output_filename and output_filename != "stdout" and output_filename != "-" and os.access(output_filename, os.F_OK) and not os.access(output_filename, os.W_OK): - error("Unable to write to: %s" % str(options.output_filename)) - - -other_start_columns = [] -if options.other_start_columns: - try: - for c in options.other_start_columns.split(","): - other_start_columns += [int(c) - 1] - except: - error("-m COLUMNS - all elements in the comma-separated list must be integers.") - -other_end_columns = [] -if options.other_end_columns: - try: - for 
c in options.other_end_columns.split(","): - other_end_columns += [int(c) - 1] - except: - error("-n COLUMNS - all elements in the comma-separated list must be integers.") - -if verbose: - print(" Input file: " + input_filename) - print(" Output file: " + output_filename) - - -columns_to_be_moved_in_order.sort(reverse=True) - - - -# Commence processing -counter = 0 -skipped_lines_counter = 0 -header_line_found = False -header_fields = [] -prepend_lines = [] -data_lines = [] -previous_n = -1 # Checks whether data is in order -need_to_sort = False - -OUTPUT_FORMAT_DELIMITER = "\t" - -for line in open(input_filename): - line = line.strip() - if counter % 1000000 == 0 and counter > 0: - print("Processed %d records" % counter ) - - counter+=1 - if not header_line_found: - if line.startswith("#") or line == "": - prepend_lines += [line] - else: - header_line_found = True - header_fields = split_line(line) - - if len(header_fields) < 2: - error("Header appears to have only 1 column in it.") - - # Remove all the existing positional columns, and make a new 1st column: 'chrpos' - if haplotype_reference_column and haplotype_reference_column >= len(header_fields): - error("Found only %d headers. -r arg is out of range." % len(header_fields) ) - - if haplotype_alternate_column and haplotype_alternate_column >= len(header_fields): - error("Found only %d headers. -a arg is out of range." % len(header_fields) ) - - if haplotype_strand_column and haplotype_strand_column >= len(header_fields): - error("Found only %d headers. -s arg is out of range." % len(header_fields) ) - - for c in columns_to_be_moved_in_order: - if c >= len(header_fields): - error("Found only %d headers. Column %d (specified as part of -l or another COLUMN arg) is out of range." 
% (len(header_fields), c ) ) - - if not keep_copy: - header_fields.pop(c) - - - - # Rename columns to haplotypeReference, haplotypeAlternate, haplotypeStrand, and move them so that they are the 2nd,3rd,4th columns: - if haplotype_strand_column: - header_fields.insert(0, "haplotypeStrand") - - if haplotype_alternate_column: - header_fields.insert(0, "haplotypeAlternate") - - if haplotype_reference_column: - header_fields.insert(0, "haplotypeReference") - - header_fields.insert(0, "chrpos") - - if verbose: - print("Found header containing %d columns: [%s]. Changed it to: [%s]." % (len(split_line(line)), " ".join(split_line(line)), " ".join(header_fields))) - - else: - # This is a data line - line_fields = split_line(line) - - # Get the haplotype ref/alt/strand values - if haplotype_reference_column: - haplotype_reference_value = line_fields[haplotype_reference_column] - - if haplotype_alternate_column: - haplotype_alternate_value = line_fields[haplotype_alternate_column] - - if haplotype_strand_column: - haplotype_strand_value = line_fields[haplotype_strand_column] - - - # Compute the chrpos value from the chr,start,stop columns - chrpos_value = "" - if start_column: - # There is more than 1 column of position info in this line - # TODO error check line_fields[chr_column] somehow. - try: start_int = long(line_fields[start_column]) - except: error("Line #%d, Column %d: start coordinate value '%s' is not an integer." 
% (counter, start_column, line_fields[start_column])) - - if add_one_to_start_coords: - start_int += 1 # Convert to 1-based coords - line_fields[start_column] = str(start_int) # Change the original column in case keep_copy is True - - chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int ) - #@JAMES@ - #print(chrpos_value) - #Become chr_prefix aware - if chrpos_value.count("chr"): - chr_prefix = "chr" - else: - chr_prefix = "" - - if sequence_build == "UCSC" and chr_prefix == "chr": - chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int ) - elif sequence_build == "UCSC" and chr_prefix != "chr": - chrpos_value = "chr%s:%d" % ( line_fields[chr_column], start_int ) - elif sequence_build == "NCBI" and chr_prefix == "chr": - chrpos_value = "%s:%d".replace("chr","") % ( line_fields[chr_column], start_int ) - elif sequence_build == "NCBI" and chr_prefix != "chr": - chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int ) - - #/JAMES - if stop_column: - try: stop_int = long(line_fields[stop_column]) - except: error("Line #%d, Column %d: stop coordinate value '%s' is not an integer" % (counter, stop_column, line_fields[stop_column])) - - if add_one_to_end_coords: - stop_int += 1 # Convert to 1-based coords - line_fields[stop_column] = str(stop_int) # Change the original column in case keep_copy is True - - if stop_int != start_int: # If they are equal, chr1:x is the same as chr1:x-x - chrpos_value += "-%d" % stop_int - else: - # There is only 1 column of position info in this line - if not re.match(".+\:\d+([-]\d+)?", line_fields[chr_column]): - error("Line #%d: Invalid chrpos [%s] in column %d" % (counter, line_fields[chr_column], chr_column )) - - # Handle the -m arg - if other_start_columns and add_one_to_start_coords: - for c in other_start_columns: - if c >= len(line_fields): - error("Line #%d: Found only %d fields. -m arg is out of range." 
% (counter, len(line_fields)) ) - - try: - line_fields[c] = increment_by_one(line_fields[c]) - except: - error( "Line #%d: Processing -m %s arg. Couldn't parse coordinates in column %d: [%s]." % (counter, str(other_start_columns), c, line_fields[c] ) ) - # Handle the -n arg - if other_end_columns and add_one_to_end_coords: - for c in other_end_columns: - if c >= len(line_fields): - error("Line #%d: Found only %d fields. -m arg is out of range." % (counter, len(line_fields)) ) - - try: - line_fields[c] = increment_by_one(line_fields[c]) - except: - error( "Line #%d: Processing -m %s arg. Couldn't parse coordinates in column %d: [%s]." % (counter, str(other_end_columns), c, line_fields[c] ) ) - - - # Move the columns around as needed (eg. so that chrpos is in the 1th column and hap ref/alt/strand are 2nd,3rd,4th): - if not keep_copy: - for c in columns_to_be_moved_in_order: - line_fields.pop(c) - - - if haplotype_strand_column: - line_fields.insert(0, haplotype_strand_value) - - if haplotype_alternate_column: - line_fields.insert(0, haplotype_alternate_value) - - if haplotype_reference_column: - line_fields.insert(0, haplotype_reference_value) - - line_fields.insert(0, chrpos_value) - - - - # Validate - if len(line_fields) < len(header_fields): - warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. The missing fields will be treated as empty." % (counter, len(line_fields), " ".join(line_fields), len(header_fields), " ".join(header_fields), )) - while len(line_fields) < len(header_fields): - line_fields += [OUTPUT_FORMAT_DELIMITER + ""] # Append '' as filler. TODO - make this behavior a cmd-line switchable - - elif len(line_fields) > len(header_fields): - warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. Skipping..." 
% (counter, len(line_fields), " ".join(line_fields), len(header_fields), " ".join(header_fields), )) - continue - - - try: - n = chrpos_to_n(line_fields) - if not need_to_sort and n < previous_n: - need_to_sort = True - warn("Line %d is out of order. Will need to sort all lines." % counter) - previous_n = n - except Exception, e: - warn("Couldn't parse line: " + " ".join(line_fields) + ". " +str(e) + ". Skipping...") - if verbose: traceback.print_exc() - skipped_lines_counter += 1 - continue - - - - - data_lines += [ join_fields(line_fields) ] - -if verbose and skipped_lines_counter: - print("Skipped %d / %d lines. (%f%%)" % (skipped_lines_counter, counter, skipped_lines_counter/float(counter))) -if need_to_sort: - if verbose: - print("Sorting %d lines..." % len(data_lines)) - - data_lines.sort(key=line_key) - - -if verbose: - print("Writing data to: " + output_filename) - -# Write output file -if output_filename == "stdout" or output_filename == "-": - output_file = sys.stdout -else: - output_file = open(output_filename, "w+") - -for line in prepend_lines + [ join_fields(header_fields) ]: - output_file.write(line + "\n") - -for line in data_lines: - if sequence_build == "NCBI" and line.lower().startswith("chr"): - if line.lower().startswith("chrm"): - output_file.write("MT" + line[4:] + "\n") - else: - output_file.write(line[3:] + "\n") - else: - #if sequence_build == "UCSC": - # output_file.write("chr" + line + "\n") - #else: - output_file.write(line + "\n") - -output_file.close() diff --git a/python/genomicAnnotatorScripts/IndentedHelpFormatterWithNL.py b/python/genomicAnnotatorScripts/IndentedHelpFormatterWithNL.py deleted file mode 100755 index 8d49d7592..000000000 --- a/python/genomicAnnotatorScripts/IndentedHelpFormatterWithNL.py +++ /dev/null @@ -1,62 +0,0 @@ - -from optparse import IndentedHelpFormatter -import textwrap - - -# Taken from http://groups.google.com/group/comp.lang.python/browse_frm/thread/6df6e6b541a15bc2 -# This code makes optparse keep 
line-breaks in help strings. -class IndentedHelpFormatterWithNL(IndentedHelpFormatter): - def format_description(self, description): - if not description: return "" - desc_width = self.width - self.current_indent - indent = " "*self.current_indent - - # the above is still the same - bits = description.split('\n') - formatted_bits = [ - textwrap.fill(bit, - desc_width, - initial_indent=indent, - subsequent_indent=indent) - for bit in bits] - result = "\n".join(formatted_bits) + "\n" - return result - def format_option(self, option): - # The help for each option consists of two parts: - # * the opt strings and metavars - # eg. ("-x", or "-fFILENAME, --file=FILENAME") - # * the user-supplied help string - # eg. ("turn on expert mode", "read data from FILENAME") - # - # If possible, we write both of these on the same line: - # -x turn on expert mode - # - # But if the opt string list is too long, we put the help - # string on a second line, indented to the same column it would - # start in if it fit on the first line. 
- # -fFILENAME, --file=FILENAME - # read data from FILENAME - result = [] - opts = self.option_strings[option] - opt_width = self.help_position - self.current_indent - 2 - if len(opts) > opt_width: - opts = "%*s%s\n" % (self.current_indent, "", opts) - indent_first = self.help_position - else: # start help on same line as opts - opts = "%*s%-*s " % (self.current_indent, "", opt_width, opts) - indent_first = 0 - result.append(opts) - if option.help: - help_text = self.expand_default(option) -# Everything is the same up through here - help_lines = [] - for para in help_text.split("\n"): - help_lines.extend(textwrap.wrap(para, self.help_width)) -# Everything is the same after here - result.append("%*s%s\n" % ( - indent_first, "", help_lines[0])) - result.extend(["%*s%s\n" % (self.help_position, "", line) - for line in help_lines[1:]]) - elif opts[-1] != "\n": - result.append("\n") - return "".join(result) diff --git a/python/genomicAnnotatorScripts/MergeTwoTables.py b/python/genomicAnnotatorScripts/MergeTwoTables.py deleted file mode 100644 index 13d8289ad..000000000 --- a/python/genomicAnnotatorScripts/MergeTwoTables.py +++ /dev/null @@ -1,155 +0,0 @@ -import sys -import os - - -def print_help(): - sys.stderr.write("\n" + os.path.split(sys.argv[0])[1] + " [file1] [file2] \n" + \ - " Takes two tab-delimited tables and merges them, so that the output is sorted by genomic position.\n" + \ - " Both input files must be in AnnotatorInputTable format (http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator#Data_Formats),\n" + \ - " and must have identical headers.\n") - -def read_header(file_obj): - for line in file_obj: - line = line[0:-1] # Remove trailing \n - if line.strip() != "" and line[0] != "#": - return line.split("\t") - else: - raise Exception, "Reached the end of the file without finding the header" - - - - -if len(sys.argv) != 3: - print_help() - sys.exit(0) - -try: - file1 = open(sys.argv[1]) - header1 = read_header(file1) -except Exception, e: - 
sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e) + "\n") - sys.exit(0) - -try: - file2 = open(sys.argv[2]) - header2 = read_header(file2) -except Exception, e: - sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e) + "\n") - sys.exit(0) - - -if len(header1) != len(header2): - sys.stderr.write("ERROR: The two files' headers are of different lengths: \n" + str(header1) + "\n" + str(header2) + "\n") - sys.exit(0) - -if header1 != header2: - sys.stderr.write("WARNING: The two files' headers are not the same: \nHeader1: " + str(header1) + "\nHeader2: " + str(header2) + "\nUsing header1.\n") -print("\t".join(header1)) - - -def get_chrom(line): - idx = line.find(":") - if idx == -1: - raise Exception, "Invalid file format. No ':' found in line, so couldn't parse chromosome name: " + line - chrom = line[0:idx] - return chrom - -# Computes a sort key for chromosome names (UCSC order) -def compute_chrom_key(chr_value): - a = 0 - chr_value = chr_value.lower() - - if chr_value.count("_random"): - chr_value = chr_value.replace("_random", "") - a = 30 # Offset so that "random" chromosomes go last - - chr_value = chr_value.replace("chrm", "chr0").replace("chrx", "chr23").replace("chry", "chr24") - chr_value = chr_value.replace("chr","") - return a + int(chr_value) + 1 - -def compute_sort_key(line): - idx = line.find('\t') - if idx == -1: - chrpos = line - else: - chrpos = line[0:idx] - - idx = chrpos.find(":") - if idx == -1: - return chrpos - chrom = chrpos[0:idx] - pos = chrpos[idx+1:] - - idx = pos.find("-") - if idx == -1: - return int(pos) - else: - start = pos[0:idx] - end = pos[idx+1:] - return int(start) - - -def read_line(file_obj): - try: - line = file_obj.next()[0:-1] # Remove \n - key = compute_sort_key(line) - return (line, key) - except StopIteration: - return (None, None) - except Exception, e: - sys.stderr.write("ERROR: While reading file \"" + sys.argv[1] + "\": " + str(e) + "\n") - 
sys.exit(0) - - -# Read the 1st lines of each file -line1, key1 = read_line(file1) -line2, key2 = read_line(file2) - - -# Do a merge sort -while line1 != None or line2 != None: # Iterate over each chromosome - # Compute the next chromosome - if line1 != None and line2 != None: - chrom1 = get_chrom(line1) - chrom2 = get_chrom(line2) - if compute_chrom_key(chrom1) < compute_chrom_key(chrom2): - current_chrom = chrom1 - else: - current_chrom = chrom2 - elif line1 != None: - current_chrom = get_chrom(line1) - elif line2 != None: - current_chrom = get_chrom(line2) - - # Iterate over lines for that chromosome - while line1 != None and line2 != None and get_chrom(line1) == current_chrom and get_chrom(line2) == current_chrom: - - if key2 > key1: - print(line1) - #print("line1 - key1: " + str(key1) + " key2: " + str(key2)) - used_line1 = True - else: - #print("line2 - key1: " + str(key1) + " key2: " + str(key2)) - print(line2) - used_line1 = False - - if used_line1: - line1, key1 = read_line(file1) - else: - line2, key2 = read_line(file2) - - - - # At this point, either line1 or line2 will == None - - while line1 != None and get_chrom(line1) == current_chrom: - print(line1) - line1, key1 = read_line(file1) - - while line2 != None and get_chrom(line2) == current_chrom: - print(line2) - line2, key2 = read_line(file2) - - - - diff --git a/python/getBamFilesFromSpreadsheet.py b/python/getBamFilesFromSpreadsheet.py deleted file mode 100755 index bb6980780..000000000 --- a/python/getBamFilesFromSpreadsheet.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python - -import sys -import os - -bamfile_base = "/seq/picard_aggregation/" -fingerprint_base = "/seq/references/reference_genotypes/non-hapmap/" -hg18_reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" -hg18_dbsnp = "/humgen/gsa-hpprojects/GATK/data/dbsnp_130_hg18.rod" -b36_dbsnp = "/humgen/gsa-hpprojects/GATK/data/dbsnp_130_b36.rod" -b36_reference = 
"/broad/1KG/reference/human_b36_both.fasta" -hg18_intervals = "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list" -hg18_intervals = "/humgen/gsa-hpprojects/FHS/indexed/interval_lists/fhs_jhs_pilot.targets.interval_list" -b36_intervals = "" - -min_base_q = "10" -min_map_q = "10" -max_reads = "1000000" -min_conf = "50" -variant_expression = "QUAL <= 50.0 || AB > 0.75 || QD < 5.0 || HRun > 3" -spreadsheetPath = sys.argv[3] -projectName = sys.argv[2] -groupName = sys.argv[1] -reference = sys.argv[4] -filter_name = projectName+"_Initial_Filter" -if ( reference != "hg18" and reference != "b36" ): - raise ValueError("Illegal reference type") -elif ( reference == "hg18" ): - reference = hg18_reference - dbsnp = hg18_dbsnp - intervals = hg18_intervals - fpref = "Homo_sapiens_assembly18" -else: - reference = b36_reference - dbsnp = b36_dbsnp - intervals = b36_intervals - fpref = "human_b36" - -outputFile = projectName+"_bam_files.txt" -OUTPUT_HEADER = ["sample_id","recalibrated_bam_file","individual_id","fingerprint_file","reference_file","interval_list","max_reads_at_locus","min_confidence","min_mapping_quality","min_base_quality","variant_filter_expression","variant_filter_name"] -OUTPUT_HEADER_INDIVIDUAL = ["reference_file","interval_list","max_reads_at_locus","min_confidence","min_mapping_quality","min_base_quality","variant_filter_expression","variant_filter_name"] - -if ( spreadsheetPath.find("/") > -1 ): - newSpreadsheet = spreadsheetPath.rsplit("/",1)[1].rsplit(".",1)[0]+"_proper_format.tsv" -else: - newSpreadsheet = spreadsheetPath.rsplit(".",1)[0]+"_proper_format.tsv" - -# convert to proper format -os.system("sed 's/\\r/\\n/g' "+spreadsheetPath+" > "+newSpreadsheet) - -project_info = open(newSpreadsheet) - -header = project_info.readline().strip().split("\t") - -project_index = header.index("Project") -sample_index = header.index("Sample") -status_index = 
header.index("Sample Status") - -def versionCompare(version1,version2): - return -int(version1.split("v")[1])+int(version2.split("v")[1]) - -def getNewestVersion(baseDir): - versions = os.listdir(baseDir) - versions.sort(versionCompare) - for version in versions: - if ( "finished.txt" in os.listdir(baseDir+version+"/") ): - return version - -outputFile = open(outputFile,'w') -outputFile.write("\t".join(OUTPUT_HEADER)+"\n") - -for line in project_info.readlines(): - if ( not line.startswith("\n") and not line.startswith(" ") and not line.startswith("\t") ): - spline = line.strip().split("\t") - versioningDirectory = bamfile_base+spline[project_index]+"/"+spline[sample_index]+"/" - version = getNewestVersion(versioningDirectory) - bamfile = versioningDirectory+version+"/"+spline[sample_index]+".bam" - fingerprint_path = fingerprint_base+spline[project_index]+"/"+fpref+"/" - if ( os.path.isdir(fingerprint_path) and spline[sample_index]+".fingerprint.geli" in os.listdir(fingerprint_path) ): - fingerprint_file = fingerprint_path+spline[sample_index]+".fingerprint.geli" - else: - fingerprint_file = "" - if ( spline[status_index] == "Complete" ): - outputFile.write(projectName+"_"+spline[sample_index]+"\t"+bamfile+"\t"+groupName+"\t"+fingerprint_file+"\t"+reference+"\t"+intervals+"\t"+max_reads+"\t"+min_conf+"\t"+min_map_q+"\t"+min_base_q+"\t"+variant_expression+"\t"+filter_name+"\n") - -outputFile.close() -outputFile = open(projectName+"_Project_Entry.txt",'w') -outputFile.write("individual_set_id\n") -outputFile.write(projectName) -outputFile.close() -outputFile = open(projectName+"_Population_Entry.txt",'w') -outputFile.write("individual_id\tindividual_set_id\t"+"\t".join(OUTPUT_HEADER_INDIVIDUAL)+"\n") -outputFile.write(groupName+"\t"+projectName+"\t"+reference+"\t"+intervals+"\t"+max_reads+"\t"+min_conf+"\t"+min_base_q+"\t"+variant_expression+"\t"+filter_name+"\n") diff --git a/python/getLaneAndSequenceInfo.py b/python/getLaneAndSequenceInfo.py deleted file mode 
100644 index 073be0ad1..000000000 --- a/python/getLaneAndSequenceInfo.py +++ /dev/null @@ -1,158 +0,0 @@ -lanes_file = open("/humgen/gsa-hpprojects/FHS/indexed/production/oneOffAnalyses/coverage_4_14/docs/fhs_squid_lanes_4_15.txt") -samples_file = open("/humgen/gsa-hpprojects/FHS/indexed/production/oneOffAnalyses/coverage_4_14/docs/fhs_squid_samples_4_15.txt") -lanes_header = lanes_file.readline().split("\t") -import os -import time -DEBUG = False -DEBUG_RECORDS = 50 -# dumb TSV often saves with strings like "190,12" -def str2num(strn): - if ( strn == "" or strn == "''" or strn == '""' ): - return -1 - elif ( strn.startswith('"') ): - return str2num(strn.split('"')[1]) - elif ( strn.find(",") > -1): - return str2num(strn.replace(",","")) - elif ( strn.find(".") > -1 ): - return float(strn) - else: - try: - return int(strn) - except ValueError: - print("Odd format for int: "+strn) - exit() - -def getCaptureDate(flow,lane,lib): - __f = os.listdir("/seq/picard/"+flow+"/")[0] - checkdir = "/seq/picard/"+flow+"/"+__f+"/"+lane+"/"+lib+"/" - earlydate = None - for file in os.listdir(checkdir): - ctime = os.stat(checkdir+file) - ctime = ctime[len(ctime)-1] - if ( earlydate == None or ctime < earlydate ): - earlydate = ctime - return ctime - -class LaneRecord: - def __init__(self,line): - spline = line.strip().split("\t") - self.flowcell = spline[lanes_header.index("Flowcell")] - self.lane_number = spline[lanes_header.index("Lane")] - self.sample_id = spline[lanes_header.index("External ID")] - self.lane_id = self.flowcell+"."+self.lane_number - self.library = spline[lanes_header.index("Library")] - self.aligned_reads = str2num(spline[lanes_header.index("AL_PF_HQ_ALIGNED_READS")]) - self.read_length = str2num(spline[lanes_header.index("AL_MEAN_READ_LENGTH")]) - self.dup_pct = str2num(spline[lanes_header.index("DUP_PERCENT_DUPLICATION")]) - self.hs_lib_size = str2num(spline[lanes_header.index("HS_LIBRARY_SIZE")]) - self.hs_pf_uq_reads = 
str2num(spline[lanes_header.index("HS_PF_UNIQUE_READS")]) - self.picard_snps = str2num(spline[lanes_header.index("SNP_TOTAL_SNPS")]) - try: - self.ic_rd2_error_rate = str2num(spline[lanes_header.index("Lane IC PCT Mean RD2 Err Rate")]) - except IndexError: - print(self.lane_id) - self.ic_rd2_error_rate = 0.5 - self.date_run = getCaptureDate(self.flowcell,self.lane_number,self.library) - -class LaneRecordAggregator: - def __init__(self,by_type): - self.by = by_type - self.entries = dict() - - def inc(self,rec): - if ( self.by == "sample" ): - if ( rec.sample_id in self.entries.keys() ): - self.entries[rec.sample_id].add(rec) - else: - self.entries[rec.sample_id] = set() - self.entries[rec.sample_id].add(rec) - elif ( self.by == "flowcell" ): - if ( rec.flowcell in self.entries.keys() ): - self.entries[rec.flowcell].add(rec) - else: - self.entries[rec.flowcell] = set() - self.entries[rec.flowcell].add(rec) - elif ( self.by == "lane_number" ): - if ( rec.lane_number in self.entries.keys() ): - self.entries[rec.lane_number].add(rec) - else: - self.entries[rec.lane_number] = set() - self.entries[rec.lane_number].add(rec) - elif ( self.by == "library" ): - if ( rec.library in self.entries.keys() ): - self.entries[rec.lane_number].add(rec) - else: - self.entries[rec.lane_number] = set() - self.entries[rec.lane_number].add(rec) - else: - print("You use sample, flowcell, or lane_number. 
You should hit ctrl-c now.") - - def summaryTable(self): - strOut = "\tNumBarcodes\tDate\tReads\tLength\tDup_Pct\tLib_Size\tHS_pf_unique_reads\trd2_error_rate\tsnps" - for id in self.entries.keys(): - strOut += "\n"+id - lanes = self.entries[id] - avg_date = 0.0 - avg_reads = 0.0 - avg_length = 0.0 - avg_dup = 0.0 - avg_lib_size = 0.0 - avg_hs_pf = 0.0 - avg_err_rt = 0.0 - avg_snps = 0.0 - for lane in lanes: - avg_date += lane.date_run - avg_reads += lane.aligned_reads - avg_length += lane.read_length - avg_dup += lane.dup_pct - avg_lib_size += lane.hs_lib_size - avg_hs_pf += lane.hs_pf_uq_reads - avg_err_rt += lane.ic_rd2_error_rate - avg_snps += lane.picard_snps - avg_date = avg_date/len(lanes) - avg_reads = avg_reads/len(lanes) - avg_length = avg_length/len(lanes) - avg_dup = avg_dup/len(lanes) - avg_lib_size = avg_lib_size/len(lanes) - avg_hs_pf = avg_hs_pf/len(lanes) - avg_err_rt = avg_err_rt/len(lanes) - avg_snps = avg_snps/len(lanes) - strOut += "\t"+"\t".join([str(len(lanes)),str(time.asctime(time.gmtime(int(round(avg_date))))),str(avg_reads),str(avg_length),str(avg_dup),str(avg_lib_size),str(avg_hs_pf),str(avg_err_rt),str(avg_snps)]) - return strOut - -by_flowcell_metrics = LaneRecordAggregator("flowcell") -by_lane_metrics = set() -by_lane_number_metrics = LaneRecordAggregator("lane_number") -by_sample_metrics = LaneRecordAggregator("sample") -by_library_metrics = LaneRecordAggregator("library") -line_no = 0 -for line in lanes_file.readlines(): - lane = LaneRecord(line) - by_lane_metrics.add(lane) - by_flowcell_metrics.inc(lane) - by_lane_number_metrics.inc(lane) - by_sample_metrics.inc(lane) - by_library_metrics.inc(lane) - line_no += 1 - if ( DEBUG and line_no > DEBUG_RECORDS ): - break - if ( line_no % 100 == 0 ): - print("Read: "+str(line_no)+" lines.") - -out = open("flowcell_metrics.txt",'w') -out.write(by_flowcell_metrics.summaryTable()) -out.close() -out = open("sample_metrics.txt",'w') -out.write(by_sample_metrics.summaryTable()) -out.close() -out 
= open("lane_number_metrics.txt",'w') -out.write(by_lane_number_metrics.summaryTable()) -out.close() -out = open("library_metrics.txt",'w') -out.write(by_library_metrics.summaryTable()) -out.close() -out = open("per_lane_metrics.txt",'w') -out.write("\t"+"\t".join(["lane_id","date","reads","length","dups","lib_size","hs_reads_pf_uq","rt2_err","snps"])) -for lane in by_lane_metrics: - out.write("\n"+"\t".join([str(lane.lane_id),str(time.asctime(time.gmtime(int(round(lane.date_run))))),str(lane.aligned_reads),str(lane.dup_pct),str(lane.hs_lib_size),str(lane.hs_pf_uq_reads),str(lane.ic_rd2_error_rate),str(lane.picard_snps)])) -out.close() diff --git a/python/getRecentBamList.py b/python/getRecentBamList.py deleted file mode 100644 index 97f382797..000000000 --- a/python/getRecentBamList.py +++ /dev/null @@ -1,30 +0,0 @@ -import sys -import os - -in_list = sys.argv[1] -out_list = sys.argv[2] - -inp = open(in_list) -out = open(out_list,'w') - -for line in inp.readlines(): - path = line.strip() - #print(path) - tries = 0 - index = path.strip(".bam")+".bai" - finished = path.rsplit("/",1)[0]+"/finished.txt" - while ( not (os.path.exists(path) and os.path.exists(finished) and os.path.exists(index) ) and tries < 15 ): - base = path.rsplit("/",1)[0] - vers = base.rsplit("/",1)[1] - base = base.rsplit("/",1)[0] - bam = path.rsplit("/",1)[1] - vers = "v%d" % (int(vers.lstrip("v"))+1) - path = "%s/%s/%s" % (base,vers,bam) - finished = "%s/%s/%s" % (base,vers,"finished.txt") - index = path.strip(".bam")+".bai" - #print(path) - tries += 1 - if ( os.path.exists(path) and os.path.exists(finished) and os.path.exists(index) ): - out.write(path+"\n") - else: - print("No filepath on the file system contains bam, index, and finished.txt for entry "+path) diff --git a/python/getTargetedGenes.py b/python/getTargetedGenes.py deleted file mode 100755 index e7725fba9..000000000 --- a/python/getTargetedGenes.py +++ /dev/null @@ -1,216 +0,0 @@ -import RefseqLibrary as exLib - -def 
chr2int(chr): - chr = chr.split("chr")[1] - if ( chr == "M" ): - return 0 - if ( chr == "X" ): - return 23 - if ( chr == "Y" ): - return 24 - return int(chr) - -def intervalCompareExon(exon1,exon2): - int1 = exon1.interval - int2 = exon2.interval - return intervalCompare(int1,int2) - -def intervalCompareExonTarget(exon1,target): - int1 = exon1.interval - return intervalCompare(int1,target) - -def intervalCompare(int1,int2): - chr1 = chr2int(int1.chromosome) - chr2 = chr2int(int2.chromosome) - if ( chr1 < chr2 ): - return -1 - elif ( chr1 > chr2 ): - return 1 - else: - start1 = int1.start - start2 = int2.start - return start1 - start2 - -test_first = exLib.Interval("chr2",100000,100001) -test_second = exLib.Interval("chr2",100001,100004) -shouldBeNegative = intervalCompare(test_first,test_second) -if ( not shouldBeNegative < 0 ): - raise ValueError("Interval compare is not performing properly.") - -test_first = exLib.CoveredInterval("chr2",10400,10800) -test_first.updateCoverage(exLib.Interval("chr2",10387,10420)) -test_first.updateCoverage(exLib.Interval("chr2",10410,10560)) -test_first.updateCoverage(exLib.Interval("chr2",10600,10820)) -test_first.updateCoverage(exLib.Interval("chr2",10890,10990)) -if ( test_first.getBaseCoverage() != 360 ): - print("Base coverage of test was "+str(test_first.getBaseCoverage())+" (400 expected)") - print("Testing intersection....") - g = exLib.Interval("chr2",10387,10420).intersect(exLib.Interval("chr2",10410,10560)) - if ( g.chromosome != "chr2" or g.start != 10410 or g.stop != 10420 ): - print("Bad intersection! 
"+str(g)) - if ( g.size() != 10 ): - print("Size not performing correctly") - raise ValueError("Covered interval is not performing properly") - -refseq_exons = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.cds.bed") -refseq_utr3 = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.utr3.bed") -refseq_utr5 = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.utr5.bed") -cancer_6000 = open("/seq/references/HybSelOligos/tcga_6k_genes.design") - -counter = 0 -exons = list() -def isNotValid(rline): - try: - n = chr2int(line.split()[0]) - except ValueError: - return True - return False - -def parseLine(rline,token): - spline = line.strip().split() - chrom = spline[0] - start = int(spline[1]) - stop = int(spline[2]) - longname = spline[3] - geneName = longname.split("_"+token)[0] - exonID = longname.split("_"+token+"_")[1].split("_chr")[0]+"_"+"_".join([token,longname[len(longname)-1]]) - return exLib.Exon(geneName,exonID,chrom,start,stop) - -for line in refseq_exons.readlines(): - if ( isNotValid(line) ): - continue - counter = 1 + counter - exons.append(parseLine(line,"cds")) - if ( counter % 25000 == 0 ): - print(str(counter)+" exons added") - -for line in refseq_utr3: - if ( isNotValid(line) ): - continue - counter = 1 + counter - exons.append(parseLine(line,"utr3")) - if ( counter % 25000 == 0 ): - print(str(counter)+" exons added") - -for line in refseq_utr5: - if ( isNotValid(line) ): - continue - counter = 1 + counter - exons.append(parseLine(line,"utr5")) - if ( counter % 25000 == 0 ): - print(str(counter)+" exons added") - -for line in cancer_6000: - if ( not line.startswith("TARGET") ): - continue - counter = 1 + counter - spline = line.strip().split() - chrom = "chr"+spline[1] - start = int(spline[2]) - stop = int(spline[3]) - longname = spline[4] - gene_name = longname.split("_")[0].split("#")[1] - exon_id = "tcga_"+longname.split("_")[1] - 
exons.append(exLib.Exon(gene_name,exon_id,chrom,start,stop)) - if ( counter % 25000 == 0 ): - print(str(counter)+" exons added") - -exons.sort(intervalCompareExon) -print(str(len(exons))+" exons added") -start_index = 0 -counter = 0 -import sys -interval_list = sys.argv[1] -headerlines = list() -for line in open(interval_list).readlines(): - if ( not line.startswith("@") and not line.startswith("#") ): - counter = 1 + counter - s = line.split() - target = exLib.Interval(s[0],int(s[1]),int(s[2])) - while ( exons[start_index].isBefore(target) and start_index < len(exons)-1): - start_index = 1 + start_index - index = start_index - while(exons[index].overlaps(target) and index < len(exons) - 1): - exons[index].updateCoverage(target) - index = 1 + index - if ( counter % 25000 == 0 ): - print("Read "+str(counter)+" lines from interval list") - #if ( counter % 5000 == 0 ): - # break - else: - if ( line.startswith("@") ): - headerlines.append(line) - -print("Done reading interval file. Creating genes.") -exons.sort(key = lambda x: x.gene) -genes = list() -counter = 0 -prevName = "@T#h12_3" -nGenes = 0 -for exon in exons: - counter = 1 + counter - if ( exon.gene != prevName ): - genes.append(exLib.Gene(exon.gene)) - nGenes = 1 + nGenes - genes[nGenes - 1].addExon(exon) - prevName = exon.gene - else: - genes[nGenes - 1].addExon(exon) - if ( counter % 20000 == 0 ): - print("Processed "+str(counter)+" exons. 
"+str(len(genes))+" gene records created.") - -def byProportion(gene1,gene2): - gene1targets = gene1.size() - gene1cvg = gene1.getBaseCoverage() - gene1prop = float(gene1cvg)/float(gene1targets) - gene2targets = gene2.size() - gene2cvg = gene2.getBaseCoverage() - gene2prop = float(gene2cvg)/float(gene2targets) - if ( gene1prop > gene2prop ): - return -1 - elif ( gene1prop < gene2prop ): - return 1 - return 0 - -genes.sort(byProportion) - -print("Writing coverage table...") -coverage = open("geneCoverage.txt",'w') -counter = 0 -exonsInWellCoveredGenes = list() -for gene in genes: - try: - coverage.write(gene.name+"\t"+str(float(gene.getBaseCoverage())/float(gene.size()))+"\t"+str(gene.size())+"\t"+str(gene.getBaseCoverage())) - if ( float(gene.getBaseCoverage())/float(gene.size()) > 0.8 ): - for exon in gene.exons: - exonsInWellCoveredGenes.append(exon) - coverage.write("\n") - except ZeroDivisionError: - continue - counter = 1 + counter - if ( counter % 5000 == 0 ): - print("Written: "+str(counter)+" genes.") -coverage.close() -print("Sorting exons by start location...") -exons.sort(intervalCompareExon) -exonsInWellCoveredGenes.sort(intervalCompareExon) -print("Writing all exon targets...") -exon_targets = open("exonTargets.interval_list",'w') -exon_targets.write("".join(headerlines)) -for exon in exons: - exon_targets.write(exon.getBedEntry()+"\n") -exon_targets.close() -print("Writing exons for well-covered genes...") -good_exon_targets = open("exonTargets_well_covered_genes.interval_list",'w') -good_exon_targets.write("".join(headerlines)) -for exon in exonsInWellCoveredGenes: - good_exon_targets.write(exon.getBedEntry()+"\n") -good_exon_targets.close() -print("Writing all exons and their overlapping targets...") -debug = open("overlapping_targets.txt",'w') -for exon in exons: - debug.write(exon.id+"\t"+str(exon.getCoverageProportion())+"\t"+str(exon.interval)+"\t") - intervals = list() - for inter in exon.getOverlappingIntervals(): - 
intervals.append(str(inter)) - debug.write("\t".join(intervals)+"\n") diff --git a/python/igvController.py b/python/igvController.py deleted file mode 100755 index 8c23b9908..000000000 --- a/python/igvController.py +++ /dev/null @@ -1,87 +0,0 @@ -# Echo server program -import socket, re -from os.path import join -from time import sleep - -from farm_commands2 import * -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import faiReader -import math -import shutil -import string -import time -from madPipelineUtils import * - -HOST = 'vm0e0-052.broadinstitute.org' # Symbolic name meaning the local host -PORT = 60151 # Arbitrary non-privileged port -LOCAL_DIR = "/Users/depristo/Desktop/IGV_screenshots" -SLEEP_TIME = 1 - -def main(): - global OPTIONS - usage = """usage: %prog [options] sites -Automatically captures IGV PNG screenshots at each site in the file sites (of the form chrX:start-stop or chrX:pos or a VCF file) by connecting to -an IGV session. See http://www.broadinstitute.org/igv/?q=PortCommands. 
Make sure you enable ports in the IGV preferences""" - parser = OptionParser(usage=usage) - parser.add_option("-w", "--wait", dest="wait", - action='store_true', default=False, - help="If provided, instead of taking screenshots we will prompt the user on the command line to press return and then jump to the new location") - parser.add_option("", "--host", dest="host", - type='string', default=HOST, - help="The host running the port enabled IGV server") - parser.add_option("-d", "--dir", dest="dir", - type='string', default=LOCAL_DIR, - help="The local directory on the host machine that screenshots should be written to") - parser.add_option("-s", "--sort", dest="sort", - type='string', default="base", - help="The sort order for the bases, currently can be one of base, position, strand, quality, sample, and readGroup.") - parser.add_option("-p", "--prefix", dest="prefix", - type='string', default="", - help="A prefix to add before the contig name before sending the command to IGV. Useful for dealing with b36 -> hg18 issues") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 1: - parser.error("incorrect number of arguments") - - sites = args[0] - #It might ignore the first line.. 
- print "Be sure to turn on ports" - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - - def sendCommand(cmd): - s.send(cmd) - print cmd.strip(), '=>', s.recv(512).strip() - - s.connect((OPTIONS.host, PORT)) - sendCommand("snapshotDirectory " + OPTIONS.dir + "\n") - c = 0 - for line in open(sites): - parts = line.split() - - if sites.find(".vcf") != -1 and parts[0][0] != "#": - site = OPTIONS.prefix + ':'.join(parts[0:2]) - else: - site = parts[0] - - print site - c+=1 - sendCommand("goto %s\n" % site) - sendCommand("sort " + OPTIONS.sort + "\n") - if OPTIONS.wait: - raw_input("Enter To Continue") - else: - print 'sleep', SLEEP_TIME, 'secs' - time.sleep(SLEEP_TIME) - sendCommand("snapshot\n") # %s.png\n" % re.sub("-","_",re.sub(':', '_', site))) - print c - - -if __name__ == "__main__": - main() - - diff --git a/python/indelVerboseStats.py b/python/indelVerboseStats.py deleted file mode 100755 index 55b9736c7..000000000 --- a/python/indelVerboseStats.py +++ /dev/null @@ -1,45 +0,0 @@ -from farm_commands2 import * -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import faiReader -import math -import shutil -import string -from madPipelineUtils import * - -def main(): - global OPTIONS - usage = "usage: %prog [options] indels.verbose.txt" - parser = OptionParser(usage=usage) - parser.add_option("-v", "--verbose", dest="verbose", - action='store_true', default=False, - help="") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 1: - parser.error("incorrect number of arguments") - - indelsFile = args[0] - - print 'event size nReadsWithIndel nReadsWithoutIndel nReadsTotal' - for line in open(indelsFile): - try: - parts = line.split() - # chr1 30502 30505 -TTT OBS_COUNTS[C/A/T]:5/5/13 AV_MM[C/R]:0.00/0.75 AV_MAPQ[C/R]:37.00/27.13 NQS_MM_RATE[C/R]:0.00/0.01 NQS_AV_QUAL[C/R]:30.90/28.30 STRAND_COUNTS[C/C/R/R]:2/3/6/2 - chr, start, stop = parts[0:3] - event = parts[3] - counts = parts[4] - 
isInsertion = event[0] == '+' - size = len(event[1:]) - nConsensusReads, nReadsWithIndel, nReads = map(int, counts.split(':')[1].split('/')) - nReadsWithoutIndel = nReads - nReadsWithIndel - print event[0], size, nReadsWithIndel, nReadsWithoutIndel, nReads - except: - continue - -if __name__ == "__main__": - main() diff --git a/python/lsf_post_touch.py b/python/lsf_post_touch.py deleted file mode 100755 index ec0e1b563..000000000 --- a/python/lsf_post_touch.py +++ /dev/null @@ -1,10 +0,0 @@ -import sys -import os -directories = list() -status = os.getenv("LSB_JOBEXIT_STAT") - -for j in range(1,len(sys.argv)) : - directories.append(sys.argv[j]) - -if ( status == "0" or status == 0): - os.system("touch "+" ".join(directories)) diff --git a/python/madPipelineUtils.py b/python/madPipelineUtils.py deleted file mode 100755 index 2fc78d41e..000000000 --- a/python/madPipelineUtils.py +++ /dev/null @@ -1,130 +0,0 @@ -from farm_commands2 import * -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import faiReader -import math -import shutil -import string - -GATK_STABLE_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar' -GATK_DEV_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar' -GATK_JAR = GATK_STABLE_JAR - -# add to GATK to enable dbSNP aware cleaning -# -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod - -hg18 = ['chrM'] + ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY'] -b36 = [str(i) for i in range(1,23)] + ['X', 'Y', 'MT'] - -#hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY'] -#b36 = [str(i) for i in range(1,23)] + ['X', 'Y'] - -HG18_TO_B36 = { - 'hg18' : 'b36', - '/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta' : '/broad/1KG/reference/human_b36_both.fasta', - '/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai' : 
'/broad/1KG/reference/human_b36_both.fasta.fai', - 'chrM' : 'MT', - 'chr' : '' } - -def hg18args_to_b36(s): - for key, value in HG18_TO_B36.iteritems(): - s = s.replace(key, value) - return s - -def appendExtension(path, newExt, addExtension = True): - root, basename = os.path.split(path) - basename, ext = os.path.splitext(basename) - print root, basename, ext, path - s = basename + '.' + newExt - if addExtension: s += ext - return os.path.join(root, s) -# return os.path.join(OPTIONS.dir, s) - -class PipelineArgs: - def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '2g', excludeChrs = [] ): - self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO ' - self.ref = ref - self.name = name - self.memory = memory - self.excludeChrs = excludeChrs - - def convertToB36(self): - return self.ref == 'b36' - - def addGATKArg(self, arg): - if len(arg) > 0 and arg[0] != ' ': - arg = ' ' + arg - self.GATK += arg - - def getCommandName(self, suffix): - if self.name != None: - return self.name + "." 
+ suffix - else: - return suffix - - def finalizedGATKCommand(self, args): - cmd = (self.GATK % self.memory) + ' ' + args - if self.convertToB36(): - cmd = hg18args_to_b36(cmd) - return cmd - - def chrsToSplitBy(self, chrs): - #print 'XXX', self.excludeChrs - return filter(lambda x: x not in self.excludeChrs, chrs) - -# -# General features -# -def simpleGATKCommand( pargs, name, args, lastJobs ): - cmd = pargs.finalizedGATKCommand(args) - return [FarmJob(cmd, jobName = pargs.getCommandName(name), dependencies = lastJobs, memlimit = pargs.memory)] - -# -# Takes a simpleGATKCommand and splits it by chromosome, merging output -# -def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeCommands ): - if cmd.cmd_str_from_user.find(" -L") != -1: - sys.exit("Found -L argument in command -- cannot be provided for parallelization by chromosome: " + cmd.cmd_str_from_user) - def makeChrCmd(chr): - chrOutputMap = map(lambda x: appendExtension(x, chr), outputsToParallelize) - chr_cmd_str = cmd.cmd_str_from_user - chr_cmd_str += ' -L ' + chr - for x, y in zip(outputsToParallelize, chrOutputMap): - chr_cmd_str = chr_cmd_str.replace(x, y) - - if myPipelineArgs.convertToB36(): - chr_cmd_str = hg18args_to_b36(chr_cmd_str) - - chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies, memlimit = myPipelineArgs.memory) - return chrCmd, chrOutputMap - - #print '######################################### chrsToSplitBy', myPipelineArgs.chrsToSplitBy(hg18) - splits = map( makeChrCmd, myPipelineArgs.chrsToSplitBy(hg18) ) - splitCommands = map(lambda x: x[0], splits) - - def mergeCommand1(i): - mergeCommand = mergeCommands[i] - mergeFile = outputsToParallelize[i] - splitFiles = map(lambda x: x[1][i], splits) - return FarmJob(mergeCommand(splitFiles, mergeFile), jobName = cmd.jobName + '.merge', dependencies = splitCommands, memlimit = myPipelineArgs.memory) - - mergeCommands = map(mergeCommand1, range(len(outputsToParallelize))) - 
- return splitCommands + mergeCommands - -def mergeVCFs(splitFiles, mergeFile): - splitFilesString = ' '.join(splitFiles) - cmd = "python ~/dev/GenomeAnalysisTK/trunk/python/mergeVCFs.py -a -f /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai %s > %s" % (splitFilesString, mergeFile) - return cmd - -def mergeByCat(splitFiles, mergeFile): - splitFilesString = ' '.join(splitFiles) - return "cat %s > %s" % (splitFilesString, mergeFile) - -def indexBAMFile( name, bamFile, lastJobs ): - cmd = 'samtools index ' + bamFile - return [FarmJob(cmd, jobName = 'samtools.index.' + name, dependencies = lastJobs, memlimit = "1g")], None diff --git a/python/makeIndelMask.py b/python/makeIndelMask.py deleted file mode 100755 index 191d0a10d..000000000 --- a/python/makeIndelMask.py +++ /dev/null @@ -1,29 +0,0 @@ -from optparse import OptionParser - -def main(): - global OPTIONS - usage = "usage: %prog [options] indelCalls.bed maskSize indelsMask.bed" - parser = OptionParser(usage=usage) - #parser.add_option("", "--dry", dest="dry", - # action='store_true', default=False, - # help="If provided, nothing actually gets run, just a dry run") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 3: - parser.error("incorrect number of arguments") - - indelCalls, maskSize, indelsMask = args - maskSize = int(maskSize) - - out = open(indelsMask, 'w') - for line in open(indelCalls): - # chr1 71996 72005 -AAAAAAAAA:4/6 - chr, indelStart, indelStop, notes = line.split() - maskStart = int(indelStart) - maskSize - maskStop = int(indelStop) + maskSize - maskNotes = notes + ":+/-" + str(maskSize) - print >> out, '\t'.join([chr, str(maskStart), str(maskStop), maskNotes]) - out.close() - -if __name__ == "__main__": - main() diff --git a/python/makeMetricsFilesForFirehose.py b/python/makeMetricsFilesForFirehose.py deleted file mode 100755 index 623c9d042..000000000 --- a/python/makeMetricsFilesForFirehose.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python - -import 
sys -import os - -bam_file = sys.argv[1] -fingerprint_file = sys.argv[2] -project = sys.argv[3] -directory = bam_file.rsplit("/",1)[0]+"/" -sample_id = bam_file.rsplit("/",1)[1].rsplit(".",1)[0] -is_metrics = directory+sample_id+".insert_size_metrics" -his_metrics = directory+sample_id+".hybrid_selection_metrics" -ali_metrics = directory+sample_id+".alignment_summary_metrics" -os.system("zip -j "+project+"_"+sample_id+"_sequencing_metrics"+" "+is_metrics+" "+his_metrics+" "+ali_metrics+" "+fingerprint_file) diff --git a/python/memo.py b/python/memo.py deleted file mode 100755 index 561d0e364..000000000 --- a/python/memo.py +++ /dev/null @@ -1,236 +0,0 @@ -#!/usr/bin/env python - -import time -class timer: - def __init__(self): - self.start() - def start(self): - self.start_time = time.time() - def stop(self): - return time.time() - self.start_time - -class time_func: - def __init__(self, fn): - self._fn = fn - def __call__(self, *args, **keywords): - tmr = timer() - result = self._fn(*args, **keywords) - elapsed = tmr.stop() - print "%.5fs elapsed." % elapsed - return result - -# DiskMemoize caches the results of function calls on disk for later, speedy retrieval -import cPickle, os, sys, math -class DiskMemoize: - """Memoize(fn) - an instance which acts like function fn given at instantiation - but memoizes the returned result of fn according to the function's arguments. 
- Memoized results are stored on disk - """ - # Supposedly will only work on functions with non-mutable arguments - def __init__(self, fn, fn_name, global_deps): - """fn - name of function to call when object is called as a function - fn_name - string corresponding to fn that is used to name memoization disk file - global_deps - global variables that affect the outcome of the function call provided - as a list of variable names as strings""" - self.fn = fn - self.fn_name = fn_name - self.global_deps = global_deps - #self.memo = {} - def __call__(self, global_deps = [], volatile_keywords = {}, skip_cache=False, verbose=True, *args, **keywords): - """Calls the function assigned at initialization - self.fn - attempting to - use a memoized result if it has been stored on disk. - - The function arguments are (in order of most likely use): - args - non-keyword arguments are DISABLED and will cause an exception to be raised - keywords - keyword arguments provided in standard function call style - volatile_keywords - hash of keywords (var_name, value) pairs to be passed to the - function call that do NOT affect the cached result in a meaningful way (e.g. 
verbose flag) - skip_cache - skips checking the cache for a memoized result, only memoize a new result - to disk (default: True)""" - - #print "BONUS:",BONUS - #print globals()["BONUS"] - - # Raise an exception if any non-keyword arguments were provided - if len(args) > 0: - raise Exception("DiskMemoize.__call__ encountered non-keyword arguments (args); it can only be called with keyword arguments (keywords)") - - # Pickling filename specifying all dependencies - pkl_filename = self.fn_name+"__" - pkl_filename += "__".join([str(arg)+"."+str(val) for arg,val in keywords.items()] + \ - ["global_"+arg_name+"."+str(globals()[arg_name]) for arg_name in global_deps]) - pkl_filename += ".pkl" - - if os.path.exists(pkl_filename) and not skip_cache: - # Just read the memoized result - pkl_file = open(pkl_filename, "rb") - print "DiskMemoize: Loading result from",pkl_filename, ;sys.stdout.flush() - tmr = timer() - result = cPickle.load(pkl_file) - calc_time = cPickle.load(pkl_file) - elapsed = tmr.stop() - speedup = calc_time/elapsed if elapsed != 0 else 9999.9999 - print ": %.5fs loading, %.5fs original calc. 
time, %.2f X speedup" % (elapsed, calc_time, speedup) - return result - else: - # Calculate the result and memoize it to disk - pkl_file = open(pkl_filename, "wb") - - print "DiskMemoize: Calculating result for:", - #print self.fn_name+" ("+(", ".join(map(str,args))) + ")" - keywords.update(volatile_keywords) - tmr = timer() - print self.fn_name+" ("+(", ".join(["=".join(map(str, it)) for it in keywords.items()])) + ")", - sys.stdout.flush() - result = self.fn(*args,**keywords) - calc_time = tmr.stop() - print ": %.5fs calculating" % calc_time - - tmr = timer() - print "DiskMemoize: Writing result to",pkl_filename, ;sys.stdout.flush() - cPickle.dump(result, pkl_file, protocol=2) - cPickle.dump(calc_time, pkl_file, protocol=2) - print ": %.5fs writing" % tmr.stop() - - return result - -def primes(first_prime, last_prime, verbose=False): - """Generates lists of prime numbers from first_prime to last_prime. - Used to test DiskMemoize""" - - #if prime < 2: return - primes = range(first_prime, last_prime) - if verbose: - print "I'm being verbose" - print "Prime range is",first_prime,"to",last_prime - last_div = int(math.sqrt(last_prime)+1) - for div in range(2, last_div): # (len(primes)/2.0)+1): - next_primes = [] - for x in primes: - if x % div != 0 or x == div: - #primes[x] = 0 - next_primes.append(x) - #primes.pop(i) - #pop_list.append(i) - primes = next_primes - if "BONUS" in globals(): - primes += [0] * BONUS - return primes * 10000 - -BONUS = 0 - -if __name__ == "__main__": - - print_checks = False - - print "Running DiskMemoize test suite." 
- primes = DiskMemoize(primes, "primes", global_deps = ['BONUS']) - - prime_list = primes(first_prime = 20, last_prime = 100, skip_cache = True, volatile_keywords = {'verbose' : True}) - if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - prime_list = primes(first_prime = 20, last_prime = 100) - if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - print "skip_check = True" - prime_list = primes(first_prime = 20, last_prime = 100, skip_cache = True) - if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - prime_list = primes(first_prime = 20, last_prime = 100) - if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - prime_list = primes(first_prime = 20, last_prime = 100, skip_cache = True) - if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - BONUS = 5 - print "BONUS:",BONUS - prime_list = primes(first_prime = 20, last_prime = 100, skip_cache = True) - if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - prime_list = primes(first_prime = 20, last_prime = 10000, skip_cache = True) - if print_checks: print "Length:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - prime_list = primes(first_prime = 20, last_prime = 10000) - if print_checks: print "Length:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - prime_list = primes(first_prime = 1, last_prime = 30000, skip_cache = True) - if print_checks: print "Length:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - prime_list = primes(first_prime = 1, last_prime = 30000) - if print_checks: print "Length:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" - - - - - -#INTJ???? -#INTP???? 
-#extreme P - -# cookies with oatmeal, coconut, chocolate chip - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# For functions taking mutable arguments, use the cPickle module, as -# in class MemoizeMutable: - -#class MemoizeMutable: -# """Memoize(fn) - an instance which acts like fn but memoizes its arguments -# Will work on functions with mutable arguments (slower than Memoize) -# """ -# def __init__(self, fn): -# self.fn = fn -# self.memo = {} -# def __call__(self, *args): -# import cPickle -# str = cPickle.dumps(args) -# if not self.memo.has_key(str): -# self.memo[str] = self.fn(*args) -# return self.memo[str] diff --git a/python/mergeVCFInfoFields.py b/python/mergeVCFInfoFields.py deleted file mode 100644 index 3c970f3eb..000000000 --- a/python/mergeVCFInfoFields.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python - -import sys - -debug = False - -print("Merging:") - -# open files for reading -vcfInputFiles = [] -vcfOutputFile = "" -for arg in sys.argv: - if( arg.endswith("Fields.py") ): - # do nothing - continue - else: - if ( arg.startswith("I=") ): - input1 = arg.strip().split("I=")[1] - input2 = input1.split(",") - for vcfInput in input2: - print(vcfInput) - vcfInputFiles.append(vcfInput) - elif ( arg.startswith("O=") ): - vcfOutputFile=open(arg.strip().split("O=")[1],'w') - else: - print("Unsupported argument: "+arg) - sys.exit() - -lines = 0 - -# the proper (albeit slower) way to do this is to read the info fields -# into a dict and then iterate through the keys - -infodict = dict() - -for inFile in vcfInputFiles: - # for each file - for line in open(inFile).readlines(): - # for each line - if ( line.startswith("#") ): - # do nothing - continue - else: - spline = line.strip().split() - chrompos = spline[0]+":"+spline[1] - info = spline[7] - if ( chrompos in infodict ): - if ( info == "." ): - # do nothing - continue - else: - curInfo = infodict[chrompos] - if ( curInfo == "." 
or curInfo == ""): - newInfo = info - else: - # now we need to parse the fields - curinfofields = set(curInfo.split(";")) - newinfofields = set(info.split(";")) - curinfofields.update(newinfofields) - newInfo = ";".join(curinfofields) - - infodict[chrompos] = newInfo - #print(newInfo) - else: - infodict[chrompos] = info - -# dictionary has been constructed; now just iterate through the first vcf -# and update the info field, printing to the output file - -if ( debug ): - for key in infodict: - print(infodict[key]) - -for line in open(vcfInputFiles[0]).readlines(): - line = line.strip() - if ( line.startswith("#") ): - # this is a header - vcfOutputFile.write(line+"\n") - else: - # this is a real line - spline = line.split() - outstring = "" - fieldno = 0 - for field in spline: - #print("fieldno="+str(fieldno)) - if ( fieldno == 7 ): - outstring = outstring+infodict[spline[0]+":"+spline[1]]+"\t" - fieldno = fieldno + 1 - else: - outstring = outstring+field+"\t" - fieldno = fieldno + 1 - # we wrote an extra \t, replace the last one with an \n - outstring = outstring.strip()+"\n" - vcfOutputFile.write(outstring) - - -# and we're done diff --git a/python/parse_pm_input.py b/python/parse_pm_input.py deleted file mode 100644 index d6239aba4..000000000 --- a/python/parse_pm_input.py +++ /dev/null @@ -1,83 +0,0 @@ -# -# Generates BAM lists from Excel and TSV files provided by project managers. Suitable for input into the pre-QC metrics generation -# script. 
-# -# To run: -# /humgen/gsa-hpprojects/software/bin/jython2.5.2/jython \ -# -J-classpath $STING_HOME/lib/poi-3.8-beta3.jar:$STING_HOME/lib/poi-ooxml-3.8-beta3.jar:$STING_HOME/lib/poi-ooxml-schemas-3.8-beta3.jar:$STING_HOME/lib/xmlbeans-2.3.0.jar:$STING_HOME/lib/dom4j-1.6.1.jar \ -# parse_pm_input.py > -# -from java.io import FileInputStream -from org.apache.poi.ss.usermodel import Row,Sheet,Workbook,WorkbookFactory - -import os,sys - -base_path = '/seq/picard_aggregation/%s/%s' - -def excel_reader(filename): - wb = WorkbookFactory.create(FileInputStream(filename)); - for sheet_number in range(wb.getNumberOfSheets()): - project_column = None - sample_column = None - - sheet = wb.getSheetAt(sheet_number); - - for cell in sheet.getRow(0): - column_index = cell.getColumnIndex() - column_contents = cell.getStringCellValue() - if column_contents == 'Project': - project_column = column_index - if column_contents == 'External ID' or column_contents == 'Individual ID': - sample_column = column_index - - if project_column != None and sample_column != None: - for row_number in range(1,sheet.getLastRowNum()+1): - project = sheet.getRow(row_number).getCell(project_column).getStringCellValue() - sample = sheet.getRow(row_number).getCell(sample_column).getStringCellValue() - yield project,sample - return - -def tsv_reader(filename): - f = open(filename,'rU') - for line in f: - tokens =line.split('\t') - project = tokens[0].strip() - sample = tokens[1].strip() - yield project,sample - f.close() - -def create_reader(filename): - extension = os.path.splitext(filename)[1] - if extension == '.xls' or extension == '.xlsx': - return excel_reader(filename) - elif extension == '.tsv' or extension == '.txt': - return tsv_reader(filename) - else: - print 'Unrecognized file extension',extension - sys.exit(1) - -if len(sys.argv) != 2: - print 'USAGE: %s ' - sys.exit(1) -if not os.path.exists(sys.argv[1]): - print 'Input file %s not found' % sys.argv[1] - sys.exit(1) - -input_filename = 
sys.argv[1] - -for project,sample in create_reader(input_filename): - sample_path = base_path % (project,sample) - versions = [] - for version_path in os.listdir(sample_path): - if version_path[0] != 'v': - print 'Hit a path name that cannot be parsed: ',version_path - sys.exit(1) - versions.append(int(version_path[1:])) - if len(versions) == 0: - continue - versions = sorted(versions) - bam_file = '%s/v%d/%s.bam' % (sample_path,versions[-1],sample) - if not os.path.exists(bam_file): - print 'Malformed file: tried to find %s, but no such path exists' % bam_file - sys.exit(1) - print bam_file diff --git a/python/picard_utils.py b/python/picard_utils.py deleted file mode 100755 index 441c06936..000000000 --- a/python/picard_utils.py +++ /dev/null @@ -1,325 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -import string -import re -import glob -import unittest -import itertools - -#lanes = ["30JW3AAXX.6", "30KRNAAXX.1", "30KRNAAXX.6", "30PYMAAXX.5"] -#idsList = ['NA12843', 'NA19065', 'NA19064', 'NA18637'] - -lanes = ["30JW3AAXX.6", "30PYMAAXX.5", "30PNUAAXX.8", "30PPJAAXX.5"] -idsList = ['NA12843', 'NA18637', "NA19058", "NA12842"] -ids = dict(zip(lanes, idsList)) -gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar" -ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" -analysis = "CombineDuplicates" - -MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar' -FIXMATES_BIN = '/seq/software/picard/current/bin/FixMateInformation.jar' -SAMTOOLS_MERGE_BIN = '/seq/dirseq/samtools/current/samtools merge' -CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar' - -def CollectDbSnpMatchesCmd(inputGeli, outputFile, lod): - return 'CollectDbSnpMatches.jar INPUT=%s OUTPUT=%s DBSNP=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dbsnp MINIMUM_LOD=%f' % (inputGeli, outputFile, lod) - -def unique(l): - return list(set(l)) - -def 
genotypes2heterozygosity(genotypes, nIndividuals = -1): - def isHET(genotype): - return genotype[0] <> genotype[1] - - if nIndividuals == -1: - n = len(genotypes) - else: - n = nIndividuals - - hets = filter( isHET, genotypes ) - nhets = len(hets) - # print genotypes, ' => hets', hets - return [nhets / (1.0*n), nhets, n] - -def genotypes2allelefrequencies(ref, genotypes, nIndividuals = -1): - if nIndividuals == -1: - n = len(genotypes) - else: - n = nIndividuals - - alleles = ''.join(genotypes) - nChroms = 2 * n - nCalledChroms = 2 * len(genotypes) - nMissingChroms = nChroms - nCalledChroms - nRefChroms = alleles.count(ref) + nMissingChroms - nAltChroms = nChroms - nRefChroms - p = float(nRefChroms) / nChroms - q = float(nAltChroms) / nChroms - - #print 'genotypes', genotypes - #print 'alleles', alleles - #print 'nChroms', nChroms - #print 'nCalledChroms', nCalledChroms - #print 'nMissingChroms', nMissingChroms - #print 'nRefChroms', nRefChroms - #print 'nAltChroms', nAltChroms - #print 'p, q', p, q - - assert p + q == 1 - - return [p, q, n] - -class PicardSNP: - def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes, sources): - self.loc = loc - self.ref = ref - self.polymorphism = polymorphism - self.heterozygosity = heterozygosity - self.nIndividuals = allelefrequencies[2] - self.allelefrequencies = allelefrequencies - self.genotypes = genotypes - self.sources = sources - - def refGenotype(self): - return self.ref + self.ref - - def hetGenotype(self): - return self.ref + self.alt() - - def homVarGenotype(self): - return self.alt() + self.alt() - - def alt(self): - return self.polymorphism[1] - - def het(self): - return self.heterozygosity[0] - - def p(self): - return self.allelefrequencies[0] - - def q(self): - return self.allelefrequencies[1] - - def countGenotype(self, genotype): - r = len(filter( lambda x: sorted(x) == sorted(genotype), self.genotypes )) - #print 'countGenotype', genotype, self.genotypes, r - return r - - def 
nRefGenotypes(self): - return self.nIndividuals - self.nHetGenotypes() - self.nHomVarGenotypes() - - def nHetGenotypes(self): - return self.countGenotype(self.hetGenotype()) - - def nHomVarGenotypes(self): - return self.countGenotype(self.homVarGenotype()) - - def __str__(self): - return '%s %s %s %s %s %s' % ( self.loc, self.ref, str(self.polymorphism), str(self.het()), str(self.allelefrequencies), str(self.genotypes)) - -def aggregatedGeliCalls2SNP( geliCallsAtSite, nIndividuals ): - #print 'geliCallsAtSite', geliCallsAtSite - loc = geliCallsAtSite[0] - #print loc - refBases = map( lambda call: call[2], geliCallsAtSite[1] ) - refBase = refBases[0] - #print 'refBases', refBases - - genotypes = map( lambda call: ''.join(sorted(call[5])), geliCallsAtSite[1] ) - allBases = unique(''.join(genotypes) + refBase) - #print 'All bases => ', allBases, genotypes - - if len(allBases) > 2: - print '*** WARNING, tri-state allele [ref=%s, all bases observed = %s] discovered at %s, ignoring the call' % ( refBase, ''.join(allBases), loc ) - return None - - #print 'genotypes', genotypes - polymorphism = unique(list(refBase + genotypes[0])) - if polymorphism[0] <> refBase: polymorphism.reverse() - - #print 'polymorphism', polymorphism - genotype = list(geliCallsAtSite[1][0][5]) - - return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes, []) - - #return '%s %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite)) - -def call2loc(call): - return call[0] + ':' + call[1] - -def aggregateGeliCalls( sortedGeliCalls ): - #return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] - return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] - -def mergeBAMCmd( output_filename, inputFiles, 
mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx2g', compression_level = 1 ): - if useSamtools: - return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles) - else: - # use picard - if type(inputFiles) <> list: - inputFiles = list(inputFiles) - - MSDStr = '' - if MSD: MSDStr = 'MSD=true' - - return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) - -def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx2g', compression_level = 1, tmpdir = '/broad/shptmp' ): - if type(inputFiles) <> list: - inputFiles = list(inputFiles) - - return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, ' I='.join(inputFiles) ) - -def getPicardPath(lane, picardRoot = '/seq/picard/'): - flowcell, laneNo = lane.split('.') - filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*') - dirs = glob.glob(filePat) - print dirs - if len(dirs) > 1: - system.exit("Bad lane -- too many directories matching pattern " + filePat) - return dirs[0] - -def getReferenceGenotypeFileFromConcordanceFile(concordFile): - # REFERENCE_GENOTYPES=/seq/references/reference_genotypes/hapmap/Homo_sapiens_assembly18/NA19058.geli - p = re.compile('REFERENCE_GENOTYPES=([/.\w]+)') - for line in open(concordFile): - match = p.search(line) - print 'Match is', line, match - if match <> None: - return match.group(1) - return None - -def hybridSelectionExtraArgsForCalling(): - return "TARGET_INTERVALS=/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list CALL_ZERO_COVERAGE_LOCI=true" - -def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''): - return "java 
-jar %s INPUT=%s OUTPUT=%s REFERENCE_SEQUENCE=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, ref, options) - -def concord(options, geli, output, genotypeFile): - return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) ) - -def readPicardConcordance(file): - p = re.compile('HOMOZYGOUS_REFERENCE|HETEROZYGOUS|HOMOZYGOUS_NON_REFERENCE') -# CATEGORY OBSERVATIONS AGREE DISAGREE PCT_CONCORDANCE -# HOMOZYGOUS_REFERENCE 853 853 0 1 -# HETEROZYGOUS 416 413 3 0.992788 -# HOMOZYGOUS_NON_REFERENCE 235 231 4 0.982979 - types = [str, int, int, int, float] - def parse1(line): - return [f(x) for f, x in zip(types, line.split())] - data = [parse1(line) for line in open(file) if p.match(line) <> None] - return data - -def splitPath(geli): - root, filename = os.path.split(geli) - s = filename.split('.') - flowcellDotlane = '.'.join(s[0:2]) - ext = '.'.join(s[2:]) - return [root, flowcellDotlane, ext] - -def read_dbsnp(dbsnp_matches): - next = False - for line in open(dbsnp_matches): - s = line.split() - if next: - return s - if len(s) > 0 and s[0] == "TOTAL_SNPS": - next = True - return [] - -# ------------------------------------------------------------------------------------------ -# -# Unit testing! 
-# -# ------------------------------------------------------------------------------------------ -class TestPicardUnils(unittest.TestCase): - def setUp(self): - import cStringIO - dataString = """chr1 1105366 T 52 99 CT 10.559975 10.559975 -117.68 -93.107178 -116.616493 -45.536842 -88.591728 -92.043671 -20.964022 -116.014435 -44.473339 -31.523996 -chr1 1105411 G 22 99 AG 12.484722 12.484722 -23.995817 -27.909206 -10.875731 -27.909206 -46.579994 -29.546518 -46.579994 -23.360453 -29.546518 -46.579994 -chr1 1105411 G 29 99 AG 12.033216 12.033216 -30.641142 -34.376297 -14.483982 -35.457623 -53.197636 -32.525024 -53.498665 -26.517199 -33.606354 -54.579994 -chr1 1105857 G 6 99 AG 7.442399 2.096584 -7.55462 -9.3608 -5.458036 -9.3608 -20.279993 -16.37723 -20.279993 -12.900434 -16.37723 -20.279993 -chr1 1105857 G 7 99 AG 10.889011 1.795554 -7.406977 -9.514187 -5.611423 -9.514187 -23.879993 -19.97723 -23.879993 -16.500435 -19.97723 -23.879993 -chr1 1106094 T 20 99 CT 6.747106 6.747106 -56.979992 -43.143734 -56.806652 -23.699236 -41.036522 -42.97039 -9.862975 -56.505623 -23.525892 -16.610081 -chr1 1110294 G 42 99 AG 21.076984 21.076984 -44.285015 -49.702267 -17.869579 -49.831242 -80.276649 -48.442669 -80.404335 -38.946564 -48.571644 -80.405624 -chr1 1111204 C 26 99 CT 11.364679 11.364679 -55.479992 -31.040928 -55.479992 -36.424099 -23.349712 -31.040928 -11.985033 -55.479992 -36.424099 -32.811741 -chr1 1111204 C 29 99 TT 34.740601 3.890135 -52.282055 -48.646442 -52.704597 -17.954794 -44.565525 -48.464859 -13.715057 -52.100475 -17.773212 -9.824923 -chr1 1111204 C 31 99 CT 18.784479 18.784479 -71.079994 -39.870823 -71.079994 -44.303268 -31.878578 -39.870823 -13.094099 -71.079994 -44.303268 -39.48679 -""" - dataFile = cStringIO.StringIO(dataString) - self.nIndividuals = 10 - - self.genotypesSets = aggregateGeliCalls(map( string.split, dataFile.readlines() ) ) - self.genotypes = map(lambda x: aggregatedGeliCalls2SNP(x, self.nIndividuals), self.genotypesSets ) - self.locs = 
["chr1:1105366", "chr1:1105411", "chr1:1105857", "chr1:1106094", "chr1:1110294", "chr1:1111204"] - self.nhets = [1, 2, 2, 1, 1, 2] - self.altAlleles = [1, 2, 2, 1, 1, 4] - - self.aaf = map( lambda x: (1.0*x) / (2 * self.nIndividuals), self.altAlleles ) - self.hets = map( lambda x: (1.0*x) / self.nIndividuals, self.nhets ) - - def testGenotypesSize(self): - self.assertEqual(len(self.genotypesSets), 6) - - def testGenotypes2Het(self): - print 'testGenotypes2Het...' - self.assertEqual(genotypes2heterozygosity(['AT']), [1, 1, 1]) - self.assertEqual(genotypes2heterozygosity(['AA']), [0, 0, 1]) - self.assertEqual(genotypes2heterozygosity(['TT']), [0, 0, 1]) - self.assertEqual(genotypes2heterozygosity(['AT', 'AT']), [1, 2, 2]) - self.assertEqual(genotypes2heterozygosity(['AA', 'AA']), [0, 0, 2]) - self.assertEqual(genotypes2heterozygosity(['AT', 'AA']), [0.5, 1, 2]) - self.assertEqual(genotypes2heterozygosity(['AT', 'TT']), [0.5, 1, 2]) - self.assertEqual(genotypes2heterozygosity(['AT', 'TT', 'AA']), [1.0/3, 1, 3]) - self.assertEqual(genotypes2heterozygosity(['AT', 'AT', 'AA']), [2.0/3, 2, 3]) - - self.assertEqual(genotypes2heterozygosity(['AT', 'AT'], 10), [2.0/10, 2, 10]) - self.assertEqual(genotypes2heterozygosity(['AT', 'AA'], 10), [1.0/10, 1, 10]) - - def testAlleleFreqs(self): - print 'testAlleleFreqs...' 
- self.assertEqual(genotypes2allelefrequencies('A', ['AT']), [0.5, 0.5, 1]) - self.assertEqual(genotypes2allelefrequencies('T', ['AT']), [0.5, 0.5, 1]) - self.assertEqual(genotypes2allelefrequencies('A', ['AA']), [1.0, 0.0, 1]) - self.assertEqual(genotypes2allelefrequencies('A', ['TT']), [0.0, 1.0, 1]) - - self.assertEqual(genotypes2allelefrequencies('A', ['TT'], 2), [0.5, 0.5, 2]) - self.assertEqual(genotypes2allelefrequencies('A', ['AA'], 2), [1.0, 0.0, 2]) - self.assertEqual(genotypes2allelefrequencies('A', ['AT'], 2), [3.0/4, 1.0/4, 2]) - self.assertEqual(genotypes2allelefrequencies('A', ['AT'], 3), [5.0/6, 1.0/6, 3]) - self.assertEqual(genotypes2allelefrequencies('T', ['AT'], 3), [5.0/6, 1.0/6, 3]) - - self.assertEqual(genotypes2allelefrequencies('A', ['AT', 'AT'], 3), [4.0/6, 2.0/6, 3]) - self.assertEqual(genotypes2allelefrequencies('A', ['AT', 'TT'], 3), [3.0/6, 3.0/6, 3]) - self.assertEqual(genotypes2allelefrequencies('A', ['AT', 'TT', 'AA']), [3.0/6, 3.0/6, 3]) - - def testGenotypeSetLocs(self): - for set, loc in zip(self.genotypesSets, self.locs): - #print loc, set - self.assertEqual(set[0], loc) - - def testGenotypeLocs(self): - for genotype, loc in zip(self.genotypes, self.locs): - self.assertEqual(genotype.loc, loc) - - def testGenotypeHets(self): - print 'testGenotypeHets:' - for genotype, het in zip(self.genotypes, self.hets): - print ' => ', genotype, het - self.assertEqual(genotype.het(), het) - - def testGenotypeAlleleFreqs(self): - print 'testGenotypeAlleleFreqs:' - for genotype, af in zip(self.genotypes, self.aaf): - print ' => ', genotype, af - self.assertEqual(genotype.allelefrequencies, [1 - af, af, self.nIndividuals]) - - def testSplit(self): - self.assertEqual(splitPath('/seq/picard/30GA9AAXX/C1-152_2008-10-23_2009-04-05/1/Solexa-8267/30GA9AAXX.1.observed_genotypes.geli'), ['/seq/picard/30GA9AAXX/C1-152_2008-10-23_2009-04-05/1/Solexa-8267', '30GA9AAXX.1', 'observed_genotypes.geli']) - 
self.assertEqual(splitPath('/seq/picard/30GA9AAXX/C1-152_2008-10-23_2009-04-05/2/Solexa-8268/30GA9AAXX.2.observed_genotypes.geli'), ['/seq/picard/30GA9AAXX/C1-152_2008-10-23_2009-04-05/2/Solexa-8268', '30GA9AAXX.2', 'observed_genotypes.geli']) - -if __name__ == '__main__': - unittest.main() diff --git a/python/privateMutationRates.py b/python/privateMutationRates.py deleted file mode 100755 index 6bdaf5401..000000000 --- a/python/privateMutationRates.py +++ /dev/null @@ -1,141 +0,0 @@ -import sys -from optparse import OptionParser -from itertools import * -import random - -# a simple script that does: -# 1 -- generates a master set of variants following the neutral expectation from a single big population -# 2 -- randomly generates M individuals with variants and genotypes sampled as expected from the big population of variants -# 3 -- writes out the genotypes of these individuals, and their allele frequency -def main(): - global OPTIONS - usage = "usage: %prog [options] outputFile" - parser = OptionParser(usage=usage) - - parser.add_option("-N", "", dest="bigPopSize", - type='int', default=1000, - help="") - - parser.add_option("-M", "", dest="smallPopSize", - type='int', default=100, - help="") - - parser.add_option("-K", "", dest="nHetsPerSample", - type='int', default=1000, - help="") - - parser.add_option("", "--maxMAF", dest="maxMAF", - type='float', default=None, - help="") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 1: - parser.error("Takes no arguments") - - random.seed(10000) - genotypes = simulateSeqExpt(OPTIONS.bigPopSize, OPTIONS.smallPopSize, OPTIONS.nHetsPerSample) - printGenotypes(genotypes, open(args[0] + ".genotypes", 'w')) - printAFS(genotypes, open(args[0] + ".afs", 'w')) - -class Variant: - def __init__(self, id, trueAC, trueAN): - self.id = "%d.%d" % ( trueAC, id ) - self.trueAC = trueAC - self.trueAN = trueAN - - q = self.af() - p = 1 - q - self.hw = [p * p, 2 * p * q, q * q] - - def __str__(self): - return "[V %s ac=%d an=%d 
af=%.2f]" % (self.id, self.trueAC, self.trueAN, self.af()) - __repr__ = __str__ - - def af(self): - return self.trueAC / (1.0*self.trueAN) - - def hwe(self): # returns phomref, phet, phomvar - return self.hw - -def simulateSeqExpt(bigPopSize, smallPopSize, nHetsPerSample): - """Master runner function""" - trueAFS = makeAFS(bigPopSize, nHetsPerSample) - - variants = AFStoVariants(trueAFS, bigPopSize) - - # returns a list of variants per sample - genotypes = genotypeSamples(variants, smallPopSize) - - return genotypes - -def makeAFS(nSamples, nHetsPerSample): - """Generates allele frequency spectrum counts for nsamples and nHetsPerSample from neutral expectation""" - nTotalVariants = nHetsPerSample * sum([1 / (1.0*i) for i in range(1, nSamples * 2 + 1)]) - AFSCounts = [int(round(nHetsPerSample / (1.0*i))) for i in range(1, nSamples * 2 + 1)] - print AFSCounts - print nTotalVariants - print sum(AFSCounts) - return AFSCounts - -def AFStoVariants(trueAFS, bigPopSize): - """Converts an allele frequency spectrum to specific named Variant objects""" - variants = [] - - nChromosomes = 2 * bigPopSize - for ac in range(len(trueAFS)): - af = (1.0*ac) / nChromosomes - if OPTIONS.maxMAF == None or af <= OPTIONS.maxMAF: - for j in range(trueAFS[ac]): - v = Variant(j, ac+1, nChromosomes) - #print ac, j, v - variants.append(v) - else: - print 'Skipping AC', ac, ' / ', nChromosomes, 'beyond max MAF', OPTIONS.maxMAF - - return variants - -# returns a list of variants per sample -def genotypeSamples(variants, nSamples): - """Given a list of variants, generates nSamples genotypes""" - return [genotypeSample(samplei, variants) for samplei in range(nSamples)] - -def genotypeSample(id, variants): - """Generate a single set of genotypes for a single using the list of variants""" - print 'Genotyping sample', id - genotypes = [] - for v in variants: - pHomRef, pHet, pHomVar = v.hwe() - r = random.random() - if r > pHomRef: # are we not reference? - if r > pHomRef + pHet: # are we hom var? 
- count = 2 - else: - count = 1 - #print (r, v.af(), pHomRef, pHet, pHomVar, count) - genotypes.append([v, count]) - - return genotypes - -def printGenotypes(sampleGenotypes, out): - print >> out, "\t".join(["sample", "id", "ac", "an", "g"]) - for sample, i in izip(sampleGenotypes, count(len(sampleGenotypes))): - for v, g in sample: - print >> out, "\t".join(map(str, [i-1, v.id, v.trueAC, v.trueAN, g])) - -def printAFS(sampleGenotypes, out): - print >> out, "\t".join(["id", "true.ac", "true.an", "true.af", "small.ac", "small.an", "small.af"]) - counts = dict() - - smallAN = len(sampleGenotypes) * 2 - for sample in sampleGenotypes: - for v, g in sample: - if v not in counts: counts[v] = 0 - counts[v] = counts[v] + g - - - for v, smallAC in counts.iteritems(): - print >> out, "\t".join(map(str, [v.id, v.trueAC, v.trueAN, v.af(), smallAC, smallAN, smallAC / (1.0*smallAN)])) - - -if __name__ == "__main__": - main() diff --git a/python/pushback_file.py b/python/pushback_file.py deleted file mode 100644 index 7b1914265..000000000 --- a/python/pushback_file.py +++ /dev/null @@ -1,17 +0,0 @@ -class pushback_file(file): - """Opens a file using the standard file interface adding the ability -to pushback or unread some section of the file that was read from the file.""" - - def __init__(self, fname, mode='r', bufsize=0): - file.__init__(self, fname, mode, bufsize) - self.pushed_back = [] - - def next(self): - if len(self.pushed_back): - return self.pushed_back.pop() - else: - return file.next(self) - - def pushback(self, item): - """Put some item (bytes, line, etc.) 
back on the \"front\" of the file""" - self.pushed_back.append(item) diff --git a/python/realignBamByChr.py b/python/realignBamByChr.py deleted file mode 100755 index 42ae65f48..000000000 --- a/python/realignBamByChr.py +++ /dev/null @@ -1,134 +0,0 @@ -from farm_commands2 import * -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import faiReader -import math -import shutil -import string -import picard_utils -from madPipelineUtils import * - -def getContigs(optionContigs, hg18): - if optionContigs != None: - return optionContigs.split(",") - else: - return hg18 - -def main(): - global OPTIONS - usage = "usage: %prog [options] stages input.bam outputRoot" - parser = OptionParser(usage=usage) - parser.add_option("", "--dry", dest="dry", - action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - parser.add_option("", "--b36", dest="useB36", - action='store_true', default=False, - help="If provided, BAM is assumed to aligned to b36 named chromosomes") - parser.add_option("-v", "--verbose", dest="verbose", - action='store_true', default=False, - help="") - parser.add_option("-n", "--name", dest="name", - type="string", default="realignBamByChr", - help="Farm queue to send processing jobs to") - parser.add_option("-c", "--contigs", dest="contigs", - type="string", default=None, - help="Comma-separated list of contig:start-stop values to pass to the cleaner. 
Overrides whole-genome if provided") - parser.add_option("-q", "--farm", dest="farmQueue", - type="string", default=None, - help="Farm queue to send processing jobs to") - parser.add_option("-e", "--extraArgs", dest="extraArgs", - type="string", default=None, - help="") - parser.add_option("", "--dev", dest="dev", - type='string', default="/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar", - help="If provided, we'll use the GATK dev build") - (OPTIONS, args) = parser.parse_args() - if len(args) != 3: - parser.error("incorrect number of arguments") - - stages = map(string.lower, args[0].split(",")) - inputBam, outputRoot = args[1:] - outputBamList = outputRoot + '.bams.list' - - STAGES = ['targets', 'realign', 'index', 'merge'] - for stage in stages: - if stage not in STAGES: - sys.exit('unknown stage ' + stage) - - myPipelineArgs = PipelineArgs(name = OPTIONS.name, GATK_JAR = OPTIONS.dev) - if ( OPTIONS.useB36 ): - myPipelineArgs.ref = 'b36' - - allJobs = [] - - def includeStage(name): - return name in stages - - #out = open(outputBamList, 'w') - realignInfo = [] - for chr in getContigs(OPTIONS.contigs, hg18): - lastJobs = None - - def updateNewJobs(newjobs, lastJobs): - if OPTIONS.verbose: - print 'New jobs', newjobs - allJobs.append(newjobs) - if newjobs != []: - lastJobs = newjobs - return lastJobs - - def execStage(name, func, args = [], lastJobs = []): - if OPTIONS.verbose: print 'Name is', name - newJobs, results = func(myPipelineArgs, chr, inputBam, outputRoot + '.' 
+ chr, args, lastJobs) - if includeStage(name): lastJobs = updateNewJobs(newJobs, lastJobs) - return lastJobs, results - - lastJobs, intervals = execStage('targets', createTargets) - realignJobs, realignedBam = execStage('realign', realign, intervals, lastJobs) - realignInfo.append([realignJobs, realignedBam]) - # need to merge and then index - indexJobs, ignore = execStage('index', index, realignedBam, realignJobs) - #print >> out, os.path.abspath(realignedBam) - - #out.close() - - if 'merge' in stages: - realignerJobs = [] - if realignInfo[0][0] != []: - realignerJobs = map(lambda x: x[0][0], realignInfo) - mergedBam = outputRoot + ".bam" - mergerJob = mergeBams(myPipelineArgs, mergedBam, map(lambda x: x[1], realignInfo), realignerJobs) - indexJob, ignore = index(myPipelineArgs, 'ignore', 'ignore', 'ignore', mergedBam, mergerJob) - allJobs.append(mergerJob) - allJobs.append(indexJob) - - print 'EXECUTING JOBS' - executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry) - -def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ): - outputIntervals = outputRoot + ".intervals" - GATKArgs = '-T RealignerTargetCreator -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr) - return simpleGATKCommand( myPipelineArgs, 'CreateInterval' + chr, GATKArgs, lastJobs ), outputIntervals - -def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ): - outputBAM = outputRoot + ".bam" - GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -snps %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", outputBAM + ".snps", chr) - return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM - -def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ): - return indexBAMFile( myPipelineArgs.name, realignedBam, lastJobs ) - 
-def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ): - print lastJobs - #cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 ) - cmd = picard_utils.mergeFixingMatesBAMCmd(outputFilename, bamsToMerge, compression_level = 5) - jobs1 = FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs) - #jobs2 = indexBAMFile( myPipelineArgs.name, outputFilename, jobs1 ) - return jobs1 - -if __name__ == "__main__": - main() diff --git a/python/recalAssociation.py b/python/recalAssociation.py deleted file mode 100644 index ac3bcf4ee..000000000 --- a/python/recalAssociation.py +++ /dev/null @@ -1,90 +0,0 @@ -from optparse import OptionParser -from math import log10 -from math import floor - -def foreach( appl, col ): - for z in col: - appl(z) - -def parseWiggle(line): - return int(line.strip()) - -def parseTDF(line): - return int(line.split("Q: ")[1].strip()) - -def recalibrateJointly(files,JECDF): - print("Not implemented.") - -def recalibrate(fname,hist,out): - print(fname) - output = open(out + "." 
+ fname.rsplit(".",1)[0].rsplit(".",1)[1], 'w') - denom = sum(hist.values()) - cumQuals = dict() - for key1 in hist: - sumMore = 0 - for key2 in hist: - if ( key2 >= key1 ): - sumMore += hist[key2] - cumQuals[key1] = min(int(-10*log10((0.0+sumMore)/denom)),150) - use = parseTDF - if ( fname.endswith(".wig") ): - use = parseWiggle - inFile = open(fname) - - if ( fname.endswith(".wig") ): - output.write(inFile.readline()) - - def rewrite(line,val): - if ( line.find("Q: ") == -1 ): - return str(val)+"\n" - else: - spline = line.split("Q: ") - bef = spline[0] - af = spline[1] - afSp = af.split("\t") - afSp[0] = str(val) - return bef + "\t".join(afSp) - - foreach( lambda u: output.write(rewrite(u,cumQuals[use(u)])), inFile.readlines() ) - -def run(opt,arg): - print(opt) - print(arg) - files = map(lambda u: open(u), opt.in_files) - fn = dict(zip(files,opt.in_files)) - foreach( lambda u: u[1].readline(), filter( lambda u: u[0].endswith('.wig'), zip(opt.in_files,files))) - isWig = map(lambda u: u.endswith('.wig'), opt.in_files) - isWig = dict(zip(files,isWig)) - marginalCumulativeDists = dict(map( lambda u: [u,dict()], opt.in_files)) - - def addVal(f,v): - d = marginalCumulativeDists[fn[f]] - if ( v not in d ): - d[v] = 0 - d[v] += 1 - - def parseLine(fl,line): - if ( isWig[fl] ): - return parseWiggle(line) - else: - return parseTDF(line) - - foreach( lambda f: foreach( lambda v: addVal(f,v), map(lambda u: parseLine(f,u), f.readlines() )), files ) - foreach( lambda f: f.close(), files ) - #print(marginalCumulativeDists) - if ( opt.joint ): - recalibrateJointly(opt.in_files,marginalCumulativeDists) - else: - foreach( lambda s: recalibrate(s,marginalCumulativeDists[s],opt.out), opt.in_files ) - -def main(): - usage = "usage: %prog [options] arg" - parser = OptionParser(usage) - parser.add_option("-I","--in",dest="in_files",help="file(s) to recalibrate",action="append") - parser.add_option("-J","--recalibrateJointly",dest="joint",action="store_true",help="Recalibrate quality 
scores jointly across files, rather than for each independently. Assumes lines match up exactly.") - parser.add_option("-O","--output",dest="out",action="store",help="The base name for the recalibrated output files") - (options,args) = parser.parse_args() - run(options,args) - -if __name__ == "__main__": - main() diff --git a/python/sample_lister.py b/python/sample_lister.py deleted file mode 100755 index 03a5054d1..000000000 --- a/python/sample_lister.py +++ /dev/null @@ -1,64 +0,0 @@ -import os -import subprocess - -#this will create a list of collaborator ids or a list of sample names--it will find cleaned bams even if they're not stored in the sample_set file system. I'm mainly using it as a module import. -class SampleSet: - def __init__(self, projectname, setname, path): - self.setname = setname - self.projectname = projectname - self.path = path - def sampslist(self): - '''finds and lists all samples in a set''' - try: - searchpath="ls /humgen/gsa-firehose/firehose/firehose_output/trunk/Sample_Set/" + self.setname +"/ -I CleanBam -I UnifiedGenotyper -I MergeBam" - raw_samps= subprocess.Popen([searchpath], shell=True, stdout=subprocess.PIPE).communicate()[0] - except IOError: - print( "Can't make sample list. 
Those files are not where they ought to be, or Sample_Set is not valid.") - samps = raw_samps.split("\n" + self.projectname+ "_") - samplelist = raw_samps.split("\n")[0:len(samps)] - samps[0] = samps[0].split(self.projectname+"_")[len(samps[0].split(self.projectname+"_"))-1] - samps[len(samps)-1] = samps[len(samps)-1].split("\n")[0] - return [samps, samplelist] - def bamlist(self, samplist, write=True): - '''finds and lists all cleaned bams in a sample set''' - if (write == True): - try: - if os.path.exists(self.path + "bamsfor" + self.setname + ".list"): - os.remove(self.path + "bamsfor" + self.setname + ".list") - listfile = open(self.path + "bamsfor" + self.setname + ".list", "a") - for samp in samplist: - searcher="find /humgen/gsa-firehose/firehose/firehose_output/trunk/Sample/" + repr(samp) +"/ -name \*cleaned.bam" - raw_samp= subprocess.Popen([searcher], shell=True, stdout=subprocess.PIPE).communicate()[0] - listfile.write(raw_samp) - listfile.close() - print (listfile.name) - except IOError: - print( "can't make .bam list.Those files are not where they ought to be, or Sample_Set is not valid") - else: - for samp in samplist: - searcher="find /humgen/gsa-firehose/firehose/firehose_output/trunk/Sample/" + samp +"/ -name \*cleaned.bam" - raw_samp= subprocess.Popen([searcher], shell=True, stdout=subprocess.PIPE).communicate()[0] - print(raw_samp) - def bedlist(self, samplist, write=True): - '''finds and lists all beds for a sample set''' - if (write == True): - try: - if os.path.exists(self.path + "bedsfor" + self.setname + ".list"): - os.remove(self.path + "bedsfor" + self.setname + ".list") - listfile = open(self.path + "bedsfor" + self.setname + ".list", "a") - for samp in samplist: - searcher="find /humgen/gsa-firehose/firehose/firehose_output/trunk/Sample/" + repr(samp) +"/ -name \*.bed" - raw_samp= subprocess.Popen([searcher], shell=True, stdout=subprocess.PIPE).communicate()[0] - listfile.write(raw_samp) - listfile.close() - print (listfile.name) - 
except IOError: - print( "can't make .bed list.Those files are not where they ought to be, or Sample_Set is not valid") - else: - for samp in samplist: - searcher="find /humgen/gsa-firehose/firehose/firehose_output/trunk/Sample/" + samp +"/ -name \*.bed" - raw_samp= subprocess.Popen([searcher], shell=True, stdout=subprocess.PIPE).communicate()[0] - print(raw_samp) -'''next two lines are example usage -#pfizer5=SampleSet("T2D_Altshuler_Pfizer_Plate_5", "T2D_Altshuler_Pfizer", "humgen/gsa-hphome1/corin/oneoffs/pfizer5/") -#pfizer5.bamlist(pfizer5.sampslist()[0], write=False)''' diff --git a/python/setFilterGenotypesToRef.py b/python/setFilterGenotypesToRef.py deleted file mode 100644 index e8944aa61..000000000 --- a/python/setFilterGenotypesToRef.py +++ /dev/null @@ -1,16 +0,0 @@ -import sys -print("Fixing "+sys.argv[1]+" to "+sys.argv[2]) -bad_vcf = open(sys.argv[1]) -out_vcf = open(sys.argv[2],'w') - -for line in bad_vcf.readlines(): - if ( line.startswith("#") ): - out_vcf.write(line) - else: - spline = line.strip().split("\t") - newspline = list() - for field in spline: - if ( field.find("pGeno") > -1 ): - field = "0/0:"+field.split(":",1)[1] - newspline.append(field) - out_vcf.write("\t".join(newspline)+"\n") diff --git a/python/splitIntervalsByContig.py b/python/splitIntervalsByContig.py deleted file mode 100644 index 6ad4fee1a..000000000 --- a/python/splitIntervalsByContig.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/python -import sys -input_file = sys.argv[1] -file_index = 1 - -headerLines = list() -intervals = open(input_file) -prevContig = None -outFile = None - -def parseContig(line): - if( line.find("-") > -1 ): ## format is chr:start-stop - return line.split(":")[0] - else: - return line.split("\t")[0] - -for line in open(input_file).readlines(): - if ( line.startswith("@") ): - headerLines.append(line) - else: - thisContig = parseContig(line) - if ( thisContig != prevContig ): - file_index += 1 - try: - newOutFile = open(sys.argv[file_index],'w') - if 
( outFile != None): - outFile.close() - outFile = newOutFile - for headerline in headerLines: - outFile.write(headerline) - except IndexError: - print("Error: fewer output files than contigs. Writing remainder to final file.") - prevContig = thisContig - outFile.write(line) diff --git a/python/theoPost.py b/python/theoPost.py deleted file mode 100644 index bb1880a62..000000000 --- a/python/theoPost.py +++ /dev/null @@ -1,193 +0,0 @@ -import math -import sys - -#if ( sys.version_info < (3,0) ): -# raise "Must use python version 3 or later. See /broad/software/free/Linux/redhat_5_x86_64/pkgs/python_3.1.2/bin/python3.1" - -class controls: - def __init__(self,l_x,l_y,c_x,c_y,r_x,r_y): - self.x_a = l_x - 2*c_x + r_x - self.y_a = l_y - 2*c_y + r_y - self.x_b = -2*l_x + 2*c_x - self.y_b = -2*l_y + 2*l_x - self.x_c = l_x - self.y_c = l_y - -class IntegrationCollection: - def __init__(self,start,stop,err,num_ints): - self.start = start - self.stop = stop - self.err = err - self.num_ints = num_ints - -lfmap = dict() -def logfact(a): - global lfmap - if ( a < 2 ): - return 0.0 - if ( a not in lfmap ): - lfmap[a] = math.log10(a) + logfact(a-1) - return lfmap[a] - -def logchoose(a,b): - return logfact(a)-logfact(b)-logfact(a-b) - -def logbinomial(success,trials,prob): - return logchoose(trials,success) + success*math.log10(prob) + (trials-success)*math.log10(1-prob) - -def quad(a,b,c,x): - return a*x*x + b*x + c - -def qformula(a,b,c,equivVal): - return (-b + math.sqrt(b*b-4*a*c))/(2*a) - -def cbezierf(cts,pt): - t = qformula(cts.x_a,cts.x_b,cts.x_c,pt) - y = quad(cts.y_a,cts.y_b,cts.y_c,t) - return y - -bez_cts = controls(-7,10.99919,-2.849154,0.1444735,-0.0043648054,-1.559080) # based on previous gradient descent - - -def simpson(f,ic): - class DeprecationError(Exception): - def __init__(self,val): - self.value = val - def __str__(self): - return repr(self.value) - raise DeprecationError("Simpson is deprecated. 
Do not use it.") - -def simpAux(f,a,b,eps,s,fa,fb,fc,cap): - if ( s == 0 ): - return [] - c = ( a + b )/2 - h = b-a - d = (a + c)/2 - e = (c + b)/2 - fd = f(d) - fe = f(e) - s_l = (h/12)*(fa + 4*fd + fc) - s_r = (h/12)*(fc + 4*fe + fb) - s_2 = s_l + s_r - if ( cap <= 0 or abs(s_2 - s) <= 15*eps ): - try: - return [math.log10(s_2 + (s_2 - s)/15.0)] - except OverflowError: - print(s_2) - print(s_2-s) - return [-350] - return simpAux(f,a,c,eps/2,s_l,fa,fc,fd,cap-1) + simpAux(f,c,b,eps/2,s_r,fc,fb,fe,cap-1) - -def adaptiveSimpson(f,start,stop,error,cap): - mid = (start + stop)/2 - size = stop - start - fa = f(start) - fb = f(mid) - fc = f(stop) - s = (size/6)*(fa + 4*fc + fb) - h = simpAux(f,start,stop,error,s,fa,fb,fc,int(cap)) - h.sort() - #print("first: "+str(h[0])) - #print("last: "+str(h[len(h)-1])) - return sum(map(lambda x: 10**x,h)) - -def neutral(x): - return -1.0*math.log10(x) - -def twoState(x): - if ( x < 0.04 ): - return -1.5*math.log10(x) + 0.5*math.log10(0.04) - else: - return -1.0*math.log10(x) - -def bezier(x): - return cbezierf(bez_cts,math.log10(x)) - -norm_cache = (None,None) -def resampleProbability(logshape,ic,ac,ns,ac_new,ns_new): - global norm_cache - logpost = lambda x: logshape(x) + logbinomial(ac,2*ns,x) - if ( norm_cache[1] == None or norm_cache[0] != (ac,ns,logshape) ): - print("Caching posterior norm") - norm_cache = ((ac,ns,logshape),math.log10(adaptiveSimpson( lambda v: math.pow(10,logpost(v)), ic.start,ic.stop,ic.err,ic.num_ints))) - logpost_normed = lambda v: logpost(v) - norm_cache[1] - newshape = lambda y: math.pow(10,logpost_normed(y) + logbinomial(ac_new, 2*ns_new, y)) - return adaptiveSimpson(newshape,ic.start,ic.stop,ic.err,ic.num_ints) - -def getPost(logshape,ic,ac,ns): - global norm_cache - logpost = lambda x: logshape(x) + logbinomial(ac,2*ns,x) - if ( norm_cache[1] == None or norm_cache[0] != (ac,ns,logshape) ): - print("Caching posterior norm") - norm_cache = ((ac,ns,logshape),math.log10(adaptiveSimpson(lambda v: 
math.pow(10,logpost(v)),ic.start,ic.stop,ic.err,ic.num_ints))) - return lambda v: logpost(v) - norm_cache[1] - -sim_ic = IntegrationCollection(5e-8,0.999,1e-2000,22) -sys.setrecursionlimit(int(2e6)) -#neutral_post = map( lambda v: resampleProbability(neutral,sim_ic,1,900,v,900), range(0,21) ) -#twostate_post = list(map( lambda v: resampleProbability(twoState,sim_ic,1,900,v,900), range(0,21) )) -#g = open("n_ts.txt",'w') -#idx = 0 -#for e in neutral_post: -# g.write(str(idx)) -# g.write("\t"+str(e)+"\t"+str(twostate_post[idx])+"\n") -# idx += 1 - -DO_1 = False -if ( DO_1 ): - eomiautism_ac_1 = 317763 - eomiautism_ac_2 = 78844 - eomiautism_ac_3p = 239526 # all of these go on chip by default - - new_set = 10000-917-998 - - num_unseen_sites = 125*new_set - - unseen_unseen = resampleProbability(twoState,sim_ic,0,917+998,0,new_set) - unseen_1 = resampleProbability(twoState,sim_ic,0,917+998,1,new_set)/(1-unseen_unseen) - unseen_2 = resampleProbability(twoState,sim_ic,0,917+998,2,new_set)/(1-unseen_unseen) - ac1_unseen = resampleProbability(twoState,sim_ic,1,917+998,0,new_set) - ac1_ac1 = resampleProbability(twoState,sim_ic,1,917+998,1,new_set) - ac2_unseen = resampleProbability(twoState,sim_ic,2,917+998,0,new_set) - - total = 636133 + num_unseen_sites - ac1 = unseen_1*num_unseen_sites + ac1_unseen*eomiautism_ac_1 - ac2 = unseen_2*num_unseen_sites + ac1_unseen*eomiautism_ac_1 + ac2_unseen*eomiautism_ac_2 - - print("\t".join(map(lambda u: str(u), [unseen_unseen,unseen_1,unseen_2]))) - print("\t".join(map(lambda u: str(u), [total,ac1,ac2]))) - - ea_ns = 343877 - ea_ns_ac1 = 204223 - ea_ns_ac2 = 42280 - - ns_new_ac1 = ea_ns_ac1*ac1_unseen + unseen_1*num_unseen_sites*(1.7/(1+1.7)) - ns_new_ac2 = ea_ns_ac2*ac2_unseen + unseen_2*num_unseen_sites*(1.4/(1+1.4)) - ns_new_total = ea_ns + ns_new_ac1 + ns_new_ac2 + (num_unseen_sites*(1-unseen_1-unseen_2))*(0.6/(1+0.6)) - - print("\t".join(map(lambda u: str(u), [ns_new_total,ns_new_ac1,ns_new_ac2]))) - 
print(1-resampleProbability(twoState,sim_ic,2,1000,0,10000)-resampleProbability(twoState,sim_ic,2,1000,1,10000)) - print(1-resampleProbability(twoState,sim_ic,1,100,0,2000)-resampleProbability(twoState,sim_ic,1,100,1,2000)) - print(1-resampleProbability(twoState,sim_ic,2,100,0,2000)-resampleProbability(twoState,sim_ic,2,100,1,2000)) - print(1-resampleProbability(twoState,sim_ic,20,1000,0,2000)-resampleProbability(twoState,sim_ic,20,1000,1,2000)) - -def emitPosterior(ac): - post_ac = getPost(twoState,sim_ic,ac,10000) - o = open("post_%d.txt" % ac,'w') - pt = sim_ic.start - while ( pt < 0.2 ): - o.write("%e\t%e\n" % (pt,post_ac(pt))) - pt = 1.015*pt - o.close() - -emitPosterior(2) -emitPosterior(3) -emitPosterior(10) -emitPosterior(25) - -#o = open("test2s.txt",'w') -#pt = sim_ic.start -#while ( pt<0.4 ): -# o.write("%e\t%e\n" % (pt,twoState(pt))) -# pt = 1.015*pt -#o.close() diff --git a/python/ucscRepeatMaskToIntervalList.py b/python/ucscRepeatMaskToIntervalList.py deleted file mode 100755 index bf5c7e29b..000000000 --- a/python/ucscRepeatMaskToIntervalList.py +++ /dev/null @@ -1,146 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import faiReader -import math -import shutil - -IS_OFFSET = True -CHR_OFFSET = 5 -START_OFFSET = 6 -END_OFFSET = 7 -TYPE_OFFSET = 11 - -class RepeatInfo: - def __init__(self, name): - self.name = name - self.count = 0 - self.coveredBases = 0 - -def badChr(excludes, chr): - if excludes == None: - return False - - for exclude in excludes: - if chr.find(exclude) != -1: - return True - return False - -def main(): - global OPTIONS - usage = "usage: %prog stage [options]" - parser = OptionParser(usage=usage) -# parser.add_option("-q", "--farm", dest="farmQueue", -# type="string", default=None, -# help="Farm queue to send processing jobs to") - parser.add_option("", "--header", dest="header", - type='string', default=None, - help="interval_list 
file") - parser.add_option("-o", "--output", dest="output", - type='string', default=None, - help="output interval_list filename") - parser.add_option("-r", "--ref", dest="ref", - type='string', default=None, - help="reference name -- either hg18 or b36") - parser.add_option("-x", "--exclude", dest="excludes", - action="append", type='string', - help="If provided, only run pipeline for this sample") - parser.add_option("-m", "--maxRecords", dest="maxRecords", - type='int', default=None, - help="If provided, max. number of records to process") - parser.add_option("-s", "--skip", dest="skip", - type='int', default=None, - help="If provided, only process every skip records") - parser.add_option("", "--excludeChr", dest="excludeChrs", - action="append", type='string', - help="If provided, don't include chr matching this string") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 1: - parser.error("incorrect number of arguments") - - repeatFile = args[0] - - # open the output file - out = open(OPTIONS.output, 'w') - - # write out the header - for line in open(OPTIONS.header): - out.write(line) - - info = dict() - nRecords = 1 - i = 1 - for line in open(repeatFile): - if OPTIONS.maxRecords != None and nRecords > OPTIONS.maxRecords: - break - - if len(line) == 0 or line[0] == '#': - continue - - parts = line.split() - type = parts[TYPE_OFFSET] - start = int(parts[START_OFFSET]) + 1 - end = int(parts[END_OFFSET]) + 1 - chr = parts[CHR_OFFSET] - - if OPTIONS.ref == 'b36': - chr = chr.replace('chrM', 'chrMT') - chr = chr.replace('chr', '') - - if (OPTIONS.excludes == None or type not in OPTIONS.excludes) and not badChr(OPTIONS.excludeChrs, chr): - name = 'repeat_' + str(i) + '_' + type - strand = '+' - - if OPTIONS.skip == None or i % OPTIONS.skip == 0: - print >> out, '\t'.join(map(str, [chr, start, end, strand, name])) - nRecords += 1 - - i += 1 - - if type not in info: - info[type] = RepeatInfo(type) - - typeInfo = info[type] - typeInfo.count += 1 - 
typeInfo.coveredBases += end - start - - out.close() - - for typeInfo in info.values(): - print "%20s\t%20d\t%20d" % ( typeInfo.name, typeInfo.count, typeInfo.coveredBases ) - -if __name__ == "__main__": - main() - - -# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T VCFCombine -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,ceu.trio.gatk.ug.filtered.vcf -B glfTrio,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf -O test.vcf -type UNION -priority GATK,glfTrio -l INFO -A -# java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B eval,VCF,test.vcf -B hapmap-chip,GFF,../../hapmap3Genotypes/NA12878.b36.gff --sampleName NA12878 -vcfInfoSelector set=gatk-filtered - - -# if ( $1 == 3.1 ) then -# cat ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf | cut -f 1-10 > ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only.filtered.vcf -# endif -# -# if ( $1 == 4 ) then -# foreach callset ( NA12878.allTechs.mmq10_mbq10_q200 ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only ) -# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -T CallsetConcordance -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,$callset.filtered.vcf -B glfTrio,VCF,CEU_1kg_pilot2.na12878.vcf -CT SimpleVenn -CO ${callset}_v_CEU_1kg_pilot2.filtered.vcf -l INFO -# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset2_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.CEU_1kg_pilot2Unique.vcf -# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset1_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.${callset}Unique.vcf -# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "concordant"' > 
${callset}_v_CEU_1kg_pilot2.filtered.concordant.vcf -# end -# endif -# -# if ( $1 == 5 ) then -# mkdir /humgen/gsa-scr1/pub/1000Pilot2_010710 -# -# foreach file ( NA12878.allTechs.mmq10_mbq10_q200.filtered.vcf NA12878.SLX.mmq10_mbq10_q50.filtered.vcf NA12891.calls.mmq10_mbq10_q50.filtered.vcf NA12892.calls.mmq10_mbq10_q50.filtered.vcf ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf ) -# echo $file -# cp $file /humgen/gsa-scr1/pub/1000Pilot2_010710 -# end -# -# endif diff --git a/python/updateTribbleToCorrectSVNVersion.py b/python/updateTribbleToCorrectSVNVersion.py deleted file mode 100644 index d420d609a..000000000 --- a/python/updateTribbleToCorrectSVNVersion.py +++ /dev/null @@ -1,73 +0,0 @@ -# ==================================================================================================== -# update the tribble code to the appropriate version, given the current GATK version. -# this python script looks up the base version of the GATK, checks the date, and cross-references this -# against the Tribble version. -# ==================================================================================================== - -import os, subprocess, re -from subprocess import Popen, PIPE, STDOUT -from datetime import datetime, date, time - -# test that we can use the SVN tool -print "checking that we can use svn and svnversion commands..." 
-outputSVN = Popen(["svn", "--version"], stdout=PIPE).communicate()[0] -outputSVNVersion = Popen(["svnversion", "--version"], stdout=PIPE).communicate()[0] - -# match the regex -m = re.search('version\s+(\d+\.\d+\.\d+)',outputSVN) -m2 = re.search('version\s+(\d+\.\d+\.\d+)',outputSVNVersion) - -if ((not m) or (not m2)): - raise Exception("Unable to find svn and svnversion commands") - -if (m.group(1) != m.group(1)): - raise Exception('Unable to match versions for svn (version ' + m.group(1) + ') and svnversion (version ' + m2.group(1) + ') commands') - -# check that we're in the base GATK directory -outputSVN = Popen(["svn","info"], stdout=PIPE).communicate()[0] -m = re.search('URL:\s+(.+)\s+',outputSVN) - -print "checking that we're in the base directory of a SVN check-out of the GATK..." -if (not m): - raise Exception("Unable to find current working URL for this directory in SVN; make sure your in the base GATK directory when running this script") -if (m.group(1) != "https://svn/Sting/trunk"): - raise Exception("Not in the correct working directory; we expect to be in a directory pointing to the URL https://svn/Sting/trunk") - -# now get the version and date for the GATK checkout -print "getting the GATK check-out version and date..." -dateMatch = re.search('Last Changed Date:\s+(\S+)\s+(\S+)\s+',outputSVN) -versionMatch = re.search('Last Changed Rev:\s+(.+)\s+(.+)\s+',outputSVN) -if ((not dateMatch) or (not versionMatch)): - raise Exception("Unable to match either the version or the check-out date from the svn -info output: " + outputSVN) - -dt = datetime.strptime(dateMatch.group(1)+" "+dateMatch.group(2), "%Y-%m-%d %H:%M:%S") -print " The date of the current GATK check-out is = " + dt.strftime("%A, %d. 
%B %Y %I:%M%p") - -# now look through the tribble logs for the last version before the current GATK date -# the log entries look like: r213 | hanna | 2010-09-22 11:28:16 -0400 (Wed, 22 Sep 2010) | 2 lines -outputSVN = Popen(["svn","log","--quiet","tribble"], stdout=PIPE).communicate()[0] -m = re.findall("r(\d+)\s\|\s\S+\s\|\s(\S+\s\S+)",outputSVN) -diff = None -bestRev = 0 -for match in m: - rev = match - tribbleDate = datetime.strptime(match[1] , "%Y-%m-%d %H:%M:%S") - # print tribbleDate.strftime("%A, %d. %B %Y %I:%M%p") - if (dt > tribbleDate): - if (diff == None): - diff = dt - tribbleDate - bestRev = match[0] - elif (dt - tribbleDate < diff): - diff = dt - tribbleDate - bestRev = match[0] - -if (bestRev == 0): - raise Exception("Unable to find correct revision that predates the current GATK checkout...failing") - -# now update the tribble directory to the found revision -print "attempting to update Tribble to the correct version of r" + bestRev -print "" -print "SVN update output:" -print Popen(["svn","update","-r","r"+str(bestRev),"tribble"], stdout=PIPE).communicate()[0] - - diff --git a/python/validatePosterior.py b/python/validatePosterior.py deleted file mode 100644 index 6b9a03fa3..000000000 --- a/python/validatePosterior.py +++ /dev/null @@ -1,47 +0,0 @@ -print("opening...") -ea_vcf = open("/humgen/gsa-hpprojects/analysis/privateMutations/eomi+autism/resources/callsets/eomi+autism/eomi+autism_batch.merged.vcf") -print("reading past header...") -ln = ea_vcf.readline() -import random -while (ln.startswith("#")): - ln = ea_vcf.readline() - -numcalc = 0 -nread = -1 - -def getAC(e): - if ( e.startswith("0/0") ): - return 0 - elif ( e.startswith("0/1") or e.startswith("1/0") ): - return 1 - elif ( e.startswith("1/1") ): - return 2 - else: - print("Warning: "+e) - return 0 - -def calcTrans(line): - spline = line.strip().split("\t") - gtypes = filter(lambda y: y.find("./.") == -1, spline[9:len(spline)]) - if ( len(gtypes) < 1800 ): - return (-1,-1) - 
random.shuffle(gtypes) - firstAC = reduce(lambda x,y: x + y , map(lambda u: getAC(u),gtypes[0:900])) - if ( firstAC > 5 ): - return (-1,-1) - secondAC = reduce(lambda x,y: x + y, map(lambda u: getAC(u),gtypes[900:1800])) - return (firstAC,secondAC) - -print("Calculating...") -counts = filter(lambda u: u[0] > -1, map(lambda z: calcTrans(z) ,ea_vcf.readlines())) -print("Lines actually processed: %d" % len(counts)) - -cdict = dict() -for c in counts: - if ( not c in cdict ): - cdict[c] = 0 - cdict[c] += 1 - -out = open("posterior_counts.txt",'w') -for c in cdict: - out.write("%d\t%d\t%d\n" % (c[0],c[1],cdict[c])) diff --git a/python/vcf2table.py b/python/vcf2table.py deleted file mode 100755 index 61b9fe0d8..000000000 --- a/python/vcf2table.py +++ /dev/null @@ -1,50 +0,0 @@ -import os.path -import sys -from optparse import OptionParser -from vcfReader import * - -if __name__ == "__main__": - usage = "usage: %prog [file.vcf | if absent stdin] [options]" - parser = OptionParser(usage=usage) - parser.add_option("-f", "--f", dest="fields", - type='string', default=None, - help="Comma separated list of fields to exact") - parser.add_option("-e", "--filter", dest="filter", - action='store_true', default=False, - help="If true, only includes records that aren't filtered in the output") - parser.add_option("-s", "--s", dest="skip", - type='int', default=0, - help="Only print out every 1 / skip records") - parser.add_option("-o", "--output", dest="OUTPUT", - type='string', default=None, - help="Path to output file. 
stdout if not provided") - - (OPTIONS, args) = parser.parse_args() - if len(args) > 1: - parser.error("incorrect number of arguments") - - counter = OPTIONS.skip - src = sys.stdin - if len(args) > 0: - src = open(args[0]) - - if OPTIONS.fields == None: - sys.exit("Fields argument must be provided") - - out = sys.stdout - if OPTIONS.OUTPUT != None: out = open(OPTIONS.OUTPUT, 'w') - - fields = OPTIONS.fields.split(',') - for header, vcf, count in lines2VCF(src, extendedOutput = True): - #print vcf, count - if count == 1 and vcf.hasHeader(): - print >> out, '\t'.join(fields) - - if counter > 0: - counter -= 1 - else: - counter = OPTIONS.skip - if OPTIONS.filter and vcf.failsFilters(): - pass - else: - print >> out, '\t'.join([ str(vcf.getField(field, '0')) for field in fields]) diff --git a/python/vcfGenotypeToSites.py b/python/vcfGenotypeToSites.py deleted file mode 100644 index 203d51dfc..000000000 --- a/python/vcfGenotypeToSites.py +++ /dev/null @@ -1,38 +0,0 @@ -import sys -genotype_vcf = open(sys.argv[1]) -sites_vcf = open(sys.argv[2],'w') - -sample_name = "ALL_HET" -info = "." 
-format = "GT" -het = "0/1" -use_fields = range(7) - -line_counter = 0 -print("Reading genotype file...") -for line in genotype_vcf.readlines(): - line_counter += 1 - if ( line.startswith("#") and not line.startswith("#CHR") ): - sites_vcf.write(line) - elif ( line.startswith("#CHR") ): - sites_vcf.write("##source=vcfGenotypeToSites\n") - spline = line.strip().split("\t") - newfields = list() - for i in range(9): - newfields.append(spline[i]) - newfields.append(sample_name) - sites_vcf.write("\t".join(newfields)+"\n") - else: - spline = line.strip().split("\t") - newfields = list() - for i in use_fields: - newfields.append(spline[i]) - newfields.append(info) - newfields.append(format) - newfields.append(het) - sites_vcf.write("\t".join(newfields)+"\n") - if ( line_counter % 100000 == 0 ): - print("Converted: "+str(line_counter)+" lines") - -genotype_vcf.close() -sites_vcf.close() diff --git a/python/vcfReader.py b/python/vcfReader.py deleted file mode 100755 index 033160ff4..000000000 --- a/python/vcfReader.py +++ /dev/null @@ -1,214 +0,0 @@ -import itertools - -VCF_KEYS = "CHROM POS ID REF ALT QUAL FILTER INFO".split() -VCF_KEYS_FORMAT = "CHROM POS ID REF ALT QUAL FILTER".split() - -TRANSITIONS = dict() -for p in ["AG", "CT"]: - for x in [p, ''.join(reversed(p))]: - TRANSITIONS[x.lower()] = True - TRANSITIONS[x] = True - -def is_nan(x): - return type(x) is float and x != x - -def convertToType(chr, pos, d, onlyKeys = None): - out = dict() - types = [int, float, str] - for key, value in d.items(): - if onlyKeys == None or key in onlyKeys: - for type in types: - try: - out[key] = type(value) - if is_nan(out[key]): - #print 'Warning, nan found at %s:%s, using NaN string' % (chr, pos) - out[key] = 'NaN' - break - except: - pass - else: - out[key] = value - return out - -class VCFRecord: - """Simple support for accessing a VCF record""" - def __init__(self, basicBindings, header=None, rest=[], moreFields = dict(), decodeAll = True): - self.header = header - self.info = 
parseInfo(basicBindings["INFO"]) - chr, pos = basicBindings['CHROM'], basicBindings['POS'] - self.bindings = convertToType(chr, pos, basicBindings, onlyKeys = ['POS', 'QUAL']) - self.bindings.update(moreFields) - if decodeAll: self.info = convertToType(chr, pos, self.info) - self.rest = rest - - def hasHeader(self): return self.header <> None - def getHeader(self): return self.header - - def get(self, key): return self.bindings[key] - - def getChrom(self): return self.get("CHROM") - def getPos(self): return self.get("POS") - def getLoc(self): return str(self.getChrom()) + ':' + str(self.getPos()) - - def getID(self): return self.get("ID") - def isNovel(self): return self.getID() == "." - def isKnown(self): return not self.isNovel() - - def getRef(self): return self.get("REF") - def getAlt(self): return self.get("ALT") - def isVariant(self): - v = self.getAlt() <> '.' - #print 'isVariant', self.bindings, v - return v - - def getQual(self): return self.get("QUAL") - - def getVariation(self): return self.getRef() + self.getAlt() - - def isTransition(self): - #print self.getVariation(), TRANSITIONS - return self.getVariation() in TRANSITIONS - def isTransversion(self): - return not self.isTransition() - - def getFilter(self): return self.get("FILTER") - def failsFilters(self): return not self.passesFilters() - def passesFilters(self): - v = self.getFilter() == "." 
or self.getFilter() == "0" or self.getFilter() == 'PASS' - #print self.getFilter(), ">>>", v, self - return v - - - def hasField(self, field): - return field in self.bindings or field in self.info - - def setField(self, field, value): - assert value <> None - - #print 'setting field', field, value - #print 'getInfo', self.getInfo() - if field in self.bindings: - self.bindings[field] = value - else: - self.info[field] = value - #self.setField("INFO", self.getInfo()) - #print 'getInfo', self.getInfo() - - def getField(self, field, default = None): - if field in self.bindings: - return self.get(field) - elif field in self.getInfoDict(): - return self.getInfoKey(field) - else: - return default - - #def getInfo(self): return self.get("INFO") - def getInfo(self): - def info2str(x,y): - if type(y) == bool or x == '.': - return str(x) - else: - return str(x) + '=' + str(y) - v = ';'.join(map(lambda x: info2str(*x), sorted(self.info.iteritems(), key=lambda x: x[0]))) - if v == "": - v = "." - #print 'V = ', v - return v - - def getInfoDict(self): return self.info - - def getInfoKey(self, name, default = None): - info = self.getInfoDict() - if name in info: - return info[name] - else: - return default - - def infoHasKeys(self, keys): - return all(map(lambda key: key in self.getInfo(), keys)) - - def __str__(self): - #return str(self.bindings) + " INFO: " + str(self.info) - return ' '.join(['%s=%s' % (x,y) for x,y in self.bindings.iteritems()]) - - def format(self): - return '\t'.join([str(self.getField(key)) for key in VCF_KEYS_FORMAT] + [self.getInfo()] + self.rest) - -def parseInfo(s): - d = dict() - if s != "": - for elt in s.split(";"): - if '=' in elt: - key, val = elt.split('=') - else: - key, val = elt, 1 - d[key] = val - return d - -# def parseInfo(s): -# def handleBoolean(key_val): -# if len(key_val) == 1: -# return [key_val[0], 1] -# else: -# return key_val -# -# key_val = map( lambda x: handleBoolean(x.split("=")), s.split(";")) -# return dict(key_val) - -def 
string2VCF(line, header=None, decodeAll = True): - if line[0] != "#": - s = line.split() - bindings = dict(zip(VCF_KEYS, s[0:8])) - moreFields = dict() - #print 'HELLO', header, s, decodeAll - if header <> None and decodeAll: - moreFields = dict(zip(header[8:], s[8:])) - #print header, moreFields - return VCFRecord(bindings, header, rest=s[8:], moreFields = moreFields, decodeAll = decodeAll) - else: - return None - -def readVCFHeader(lines): - header = [] - columnNames = None - for line in lines: - if line[0] == "#": - header.append(line.strip()) - else: - if header <> []: - columnNames = header[-1].strip("#").split() - return header, columnNames, itertools.chain([line], lines) - - # we reach this point for empty files - #print 'header is', header - return header, columnNames, [] - -def quickCountRecords(lines): - counter = 0 - for line in lines: - if line[0] != "#": - counter += 1 - return counter - - -def lines2VCF(lines, extendedOutput = False, decodeAll = True, header=None, columnNames = None): - if header == None: - header, columnNames, lines = readVCFHeader(lines) - counter = 0 - - for line in lines: - if line[0] != "#": - counter += 1 - vcf = string2VCF(line, header=columnNames, decodeAll = decodeAll) - if vcf <> None: - if extendedOutput: - yield header, vcf, counter - else: - yield vcf - raise StopIteration() - -def formatVCF(header, records): - #print records - #print records[0] - return itertools.chain(header, itertools.imap(VCFRecord.format, records)) - diff --git a/python/vcf_b36_to_hg18.py b/python/vcf_b36_to_hg18.py deleted file mode 100755 index 53ae40c2c..000000000 --- a/python/vcf_b36_to_hg18.py +++ /dev/null @@ -1,64 +0,0 @@ -from farm_commands2 import * -import os -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import faiReader -import math -import shutil -import string - -def main(): - global OPTIONS - usage = "usage: %prog [options] b36VCF hg18VCF" - parser = OptionParser(usage=usage) - 
parser.add_option("", "--dry", dest="dry", - action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - parser.add_option("-r","--reverse",dest="reverse",action="store_true",default=False,help="If set, will convert the hg18VCF to a b36VCF (thus reversing the functionality)") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 2: - parser.error("incorrect number of arguments") - - b36vcf, hg18vcf = args - - temp = open("tmp", 'w') - mitotemp = open("mtmp",'w') - if not OPTIONS.reverse: - for line in open(b36vcf): - length = len(line) - if length > 2 : - if line[0:2] == 'MT' or line[0] == "#": - if line[0] == "#": - mitotemp.write(line) - else: - spline = line.split("\t") - spline[0] = "chrM" - mitotemp.write("\t".join(spline)) - else: - line = 'chr' + line - temp.write(line) - temp.close() - mitotemp.close() - os.system("cat mtmp tmp > "+hg18vcf+" ; rm mtmp ; rm tmp") - else: - for line in open(hg18vcf): - if line.startswith("#") : - temp.write(line) - else: - spline = line.split("\t") - if ( spline[0] == "chrM" ): - spline[0] = "MT" - mitotemp.write("\t".join(spline)) - else: - spline[0] = spline[0].split("chr")[1] - temp.write("\t".join(spline)) - temp.close() - mitotemp.close() - os.system("cat tmp mtmp > "+b36vcf+" ; rm mtmp ; rm tmp") - -if __name__ == "__main__": - main() diff --git a/ruby/README b/ruby/README deleted file mode 100644 index 3b7740998..000000000 --- a/ruby/README +++ /dev/null @@ -1,9 +0,0 @@ -This file is a roadmap to the contents of the Ruby directory - -Contents: ------------------------------------------------------------------------------------- -restartBamboo.rb - a script used to restart Bamboo after system failures -validateIndex.rb - a script for working with Tribble indexes, type 'validateIndex.rb' -h for help - -utils/ - any basic utility methods -index/ - the index utility code diff --git a/ruby/index/Index.rb b/ruby/index/Index.rb deleted file mode 100644 index 
e3216420b..000000000 --- a/ruby/index/Index.rb +++ /dev/null @@ -1,70 +0,0 @@ -require './util/BinaryFileReader' -# this base class for all index types (at least linear and tree) -class Index - attr_reader :type, :headerVersion, :fileName, :fileSize, :t5, :md5, :flags - - # construct, given a file name - def initialize(fileName) - @inputFile = fileName - @file = BinaryFileReader.new(fileName) - magic = @file.readBytes(4) - if (magic != "TIDX") - print "#{@inputFile}: !! Magic number is not what we expected, TIDX, instead we saw #{magic} !!\n" - exit(1) - end - @type = @file.readInt() - @headerVersion = @file.readInt() - @fileName = @file.readString() - @fileSize = @file.readLong() - @ts = @file.readLong() - @md5 = @file.readString() - @flags = @file.readUInt() - @seqDict = readSeqDictionary() if (@flags == 32768) - @propCount = readPropertyDictionary() if (@headerVersion >= 3) - end - - def validate() - f = Proc.new{ print "#{@inputFile}:\t\terror: invalid type, we saw #{@type} but expected [1-2]\n"; return false} if @type < 1 or @type > 2 - f = Proc.new{ print "#{@inputFile}:\t\terror: invalid header version, we saw #{@headerVersion} but expected [1-3]\n"; return false} if @headerVersion < 1 or @headerVersion > 3 - f = Proc.new{ print "#{@inputFile}:\t\twarning: on fileName, we saw '#{@fileName}' but expected actual text\n"; return false} if @fileName == "" - f = Proc.new{ print "#{@inputFile}:\t\twarning: on TS, we saw '#{@ts}' but expected actual text\n"; return false} if @ts == "" - f = Proc.new{ print "#{@inputFile}:\t\twarning: on md5, we saw '#{@md5}' but expected actual text\n"; return false} if @md5 == "" - f.call if f != nil - return true - end - - # diff two headers - def diffHeader(otherIndex) - self.instance_variables.each { |var| - next if "#{var}" == "@file" or "#{var}" == "@sequences" - puts "Other header doesn't define #{var}" if !(otherIndex.instance_variable_defined?(var)) - one = (self.instance_variable_get(var)).to_s - two = 
(otherIndex.instance_variable_get(var)).to_s - puts "type #{var} not equal, #{one} != #{two}" if one != two - } - end - - # read the sequence dictionary, assuming we have one - def readSeqDictionary() - sequences = [] - count = @file.readInt() - count.times {|index| - sequences.add(@file.readString()) - @file.readInt() # drop the sizes for now - } - sequences # return sequences - end - - # read the sequence dictionary, assuming we have one - def readPropertyDictionary() - sequences = {} - count = @file.readInt() - count.times {|index| - sequences.put(@file.readString(),@file.readString()) } - sequences # return sequences - end - - def close() - @file.close() - end -end \ No newline at end of file diff --git a/ruby/index/IntervalIndex.rb b/ruby/index/IntervalIndex.rb deleted file mode 100644 index 5ee307bb0..000000000 --- a/ruby/index/IntervalIndex.rb +++ /dev/null @@ -1,41 +0,0 @@ -# the implementation of the interval index class -$LOAD_PATH << File.dirname(__FILE__) -require "Index.rb" - -class IntervalIndex < Index - def initialize(file) - super(file) - @nSeq = @file.readInt() - @sequences = Array.new() - @nSeq.times {|index| - @sequences.push(TISeqEntry.new(@file)) - } - end - - def diff(otherIndex) - diffHeader(otherIndex) - if (otherIndex.type != @type) - print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n" - return false - end - ret = false - end -end - -class TISeqEntry - def initialize(file) - @contig = file.readString() - @binCount = file.readInt() - @startPositions = Array.new() - @endPositions = Array.new() - @positions = Array.new() - @sizes = Array.new() - @binCount.times { |index| - @startPositions.push(file.readInt()) - @endPositions.push(file.readInt()) - @positions.push(file.readLong()) - @sizes.push(file.readInt()) - } - end -end - diff --git a/ruby/index/LinearIndex.rb b/ruby/index/LinearIndex.rb deleted file mode 100644 index 14b9eee17..000000000 --- a/ruby/index/LinearIndex.rb +++ /dev/null @@ -1,59 +0,0 @@ -# the linear 
index class implementation -$LOAD_PATH << File.dirname(__FILE__) -require "Index.rb" - -class LinearIndex < Index - attr_accessor :nSeq, :sequences - def initialize(file) - super(file) - @nSeq = @file.readInt() - @sequences = Array.new() - @nSeq.times {|index| - @sequences.push(LISeqEntry.new(@file)) - } - end - - def diff(otherIndex) - diffHeader(otherIndex) - if (otherIndex.type != @type) - print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n" - return false - end - ret = false - notInOther = @sequences.reject {|item| - return true if !otherIndex.sequences.include?(item) - item.diff(otherIndex.sequences[otherIndex.sequences.index(item)]) - } - notInOther.pretty_print - end - - -end - -class LISeqEntry - def initialize(file) - @contig = file.readString() - @binWidth = file.readInt() - @binCount = file.readInt() - @longestFeature = file.readInt() - @maxBin = file.readInt() - @totalBin = file.readInt() - @startPositions = Array.new() - @binCount.times { |index| - @startPositions.push(file.readLong()) - } - @finalPos = file.readLong() - end - - # print a summary of the index characteristics - def diff(otherLISeqEntry) - self.instance_variables.each { |var| - next if "#{var}" == "@file" or "#{var}" == "@sequences" - puts "Other LISeqEntry doesn't define #{var}" if !(otherLISeqEntry.instance_variable_defined?(var)) - one = (self.instance_variable_get(var)).to_s - two = (otherLISeqEntry.instance_variable_get(var)).to_s - puts "otherLISeqEntry: type #{var} not equal, #{one} != #{two}" if one != two - } - end -end - diff --git a/ruby/restartBamboo.rb b/ruby/restartBamboo.rb deleted file mode 100644 index 36b60ded5..000000000 --- a/ruby/restartBamboo.rb +++ /dev/null @@ -1,96 +0,0 @@ -#!/util/bin/ruby -# ######################################################### -# -# Dec 3rd, 2009 -# Aaron -# -# This script will restart Bamboo, if it's running; or if -# it's left a pid file without stopping the server -# -# It has two arguments: -# - the log file to 
store any results to (required) -# - DRYRUN, do not actually execute commands (optional) -# ######################################################### - -# require file utils, lets us change directory -require 'fileutils' - -# get the time and date, and take off any endlines -td = `date`.chomp! - -# what machine we have to run this on? -machine = "gsa2" - -# check the command line arguments -if (ARGV.size < 1 || ARGV.size > 2) - log_file.puts "#{td}: restartBamboo.rb \n" - log_file.puts "#{td}: logFileLocation: where to put the output" - log_file.puts "#{td}: DRYRUN (optional): do not execute any Bamboo commands, only echo them\nexiting..." - log_file.close() - exit(1) -end - -# bamboo location, shell script, and PID file -bamboo_location = "/humgen/gsa-scr1/gsa-engineering/bamboo/Bamboo" -bamboo_script = "bamboo.sh" -pid_file_loc = "bamboo.pid" - -# where we want to log the results, taken from the command line -log_file = File.open(ARGV[0],"a") - -if (`uname -n`.chomp! != machine) - log_file.puts "#{td}: You can only run this script on machine #{machine}...exiting!" - log_file.close() - exit(1) -end - -echo = "" -# are we a dryrun? -if (ARGV[1] == "DRYRUN") - echo = "echo " -elsif (ARGV.size == 2) - log_file.puts "#{td}: The second parameter must be DRYRUN or nothing!" - log_file.close() - exit(1) -end - -# first check if the bamboo dir exists -if (!File.exists?(bamboo_location)) - log_file.puts "#{td}: Bamboo location: #{bamboo_location} does not exist" - endlog_file.close() - exit(1) -end - -# we have to cd to the directory, since bamboo uses some relative paths (stinks!) -FileUtils.cd(bamboo_location) - -# output our current location -currentDir = FileUtils.pwd() -log_file.puts "current dir = #{currentDir}" - -# check to see if the bamboo location has a pid file -if (File.exists?(bamboo_location + File::SEPARATOR + "bamboo.pid")) - log_file.puts "#{td}: PID file exists! Is bamboo running with that PID..." 
- - # get the pid - pid = `cat #{pid_file_loc}`.to_i - log_file.puts "#{td}: got a PID value of #{pid}" - - # check for that pid - if (`ps --no-headers #{pid}` == "") - log_file.puts "#{td}: unable to find process #{pid}" - log_file.puts "#{td}: trying to remove pid file: #{pid_file_loc}" - FileUtils.rm(pid_file_loc) - else - log_file.puts "#{td}: found process #{pid}, attempting to shut down..." - shutdown_result = `#{echo}./#{bamboo_script} stop` - log_file.puts "#{td}: shutdown result:\n#{shutdown_result}" - end -end - -# regardless of what happened above, restart the server -startup_result = `#{echo}./#{bamboo_script} start` -log_file.puts "#{td}: startup result:\n#{startup_result}" - -# close the log file -log_file.close() diff --git a/ruby/util/BinaryFileReader.rb b/ruby/util/BinaryFileReader.rb deleted file mode 100644 index b29f90452..000000000 --- a/ruby/util/BinaryFileReader.rb +++ /dev/null @@ -1,44 +0,0 @@ -# a ruby class for reading in binary files; really this just adds some conv. methods like readInt(), readLong(), etc. 
-class BinaryFileReader - # constructor - def initialize(fileName) - @file = File.open(fileName,"r") - end - - # read and return an int (4 byte, signed, machine based endian) - def readInt() - (@file.sysread(4)).unpack("i")[0] - end - - # read and return an int (4 byte, unsigned, machine based endian) - def readUInt() - (@file.sysread(4)).unpack("L")[0] - end - - # read and return an long (8 byte, signed, machine based endian) - def readLong() - (@file.sysread(8)).unpack("q")[0] - end - - # read and return a set number of bytes as a string - def readBytes(count) - (@file.sysread(count)).to_s - end - - # read and return a null terminated string - def readString() - buffer = [] - ch = @file.sysread(1) - while (ch != "\0") - buffer.push(ch) - ch = @file.sysread(1) - end - buffer.to_s - end - - # close the file - def close() - @file.close() - end -end - diff --git a/ruby/validateIndex.rb b/ruby/validateIndex.rb deleted file mode 100644 index 4aa696911..000000000 --- a/ruby/validateIndex.rb +++ /dev/null @@ -1,103 +0,0 @@ -# this ruby files validates a linear index -# set the include path to include the current directory -$LOAD_PATH << File.dirname(__FILE__) - -# require a couple of files -require "index/Index.rb" -require "index/LinearIndex.rb" -require "index/IntervalIndex.rb" -require "optparse" -require "yaml" - -# This hash will hold all of the options -# parsed from the command-line by -# OptionParser. -options = {} - -optparse = OptionParser.new do|opts| - # Set a banner, displayed at the top - # of the help screen. - opts.banner = "Usage: ruby validateIndex.rb [options] file1 file2 ..." - - # Define the options, and what they do - options[:index] = [] if options[:index] == nil - opts.on( '-i', '--index INDEX (REQUIRED)', 'Specify the index. 
Multiple are allowed' ) do |file| options[:index].push(file) end - - options[:verbose] = false - opts.on( '-v', '--verbose', 'Output more information' ) do options[:verbose] = true end - - options[:validate] = false - opts.on( '-c', '--check', 'Check (Validate) the index(es) passed in as parameters' ) do options[:check] = true end - - options[:diff] = false - opts.on( '-d', '--diff', 'Diff two indexes' ) do options[:diff] = true end - - options[:print] = false - opts.on( '-p', '--print', 'Print all of the information about the file' ) do options[:print] = true end - - # This displays the help screen, all programs are - # assumed to have this option. - opts.on_tail( '-h', '--help', 'Display this screen' ) do - puts opts - exit - end -end -# parse the command line -optparse.parse! - -#Now raise an exception if we have not found a host option -if options[:index].size == 0 - puts "you must at least specify an index file!" - puts optparse -end - -# a function to load an index -def loadIndex(file) - indexTry = Index.new(file) - indexTry.close() - if (indexTry.type == 1) - puts "Linear index..." - index = LinearIndex.new(file) - else - puts "Interval index..." 
- index = IntervalIndex.new(file) - end - index -end - -#################### Control Block #################### - -# load all of the indexes -indexes = [] -options[:index].each {|indexFile| - indexes.push(loadIndex(indexFile)) -} - -# switch on the flags supplied -if (options[:diff]) - if (options[:index].size != 2) - print "Unable to diff indexes if you don't supply two and only two indexes\n"; - exit(1) - else - indexes[0].diff(indexes[1]) - end -elsif (options[:validate]) - indexes.each {|index| index.validate() } -elsif (options[:print]) - indexes.each {|index| puts YAML::dump( index ) } -end - - - - -# if they specified validate -if (options[:check]) - options[:index].each {|index| - idx = Index.new(index).validate() - } -end - - - - -exit diff --git a/scala/qscript/oneoffs/QTools.q b/scala/qscript/oneoffs/QTools.q deleted file mode 100755 index 84d4dbbea..000000000 --- a/scala/qscript/oneoffs/QTools.q +++ /dev/null @@ -1,81 +0,0 @@ -import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFExtractIntervals, VCFExtractSamples, VCFSimpleMerge, VCFExtractSites} -import org.broadinstitute.sting.queue.library.ipf.SortByRef -import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals -import org.broadinstitute.sting.queue.QScript -import collection.JavaConversions._ - -// todo -- should the argument collection on which this runs be generated at compile-time into extensions?? -// todo -- maybe a compile-time generated enum of available library functions? 
(ipf of course) -class QTools extends QScript { - @Argument(doc="Tool to run",shortName="T", required=true) var qtool : String = _ - @Argument(doc="input VCF",shortName="ivcf",required=false) var inVCF : File = _ - @Argument(doc="input VCF files",shortName="vcfs",required=false) var inVCFs : String = _ - @Argument(doc="output file",shortName="out",required=true) var output : File = _ - @Argument(doc="reference file",shortName="ref",required=false) var ref : File = _ - @Argument(doc="The samples to extract",shortName="sm",required=false) var samples : String = _ - @Argument(doc="Keep filtered sites when merging or extracting?",shortName="kf",required=false) var keepFilters : Boolean = false - @Argument(doc="Input interval list (not used with VCF tools)",shortName="il",required=false) var intervalList : File = _ - @Argument(doc="interval list expand start",shortName="il_start",required=false) var ilStart : Int = 1 - @Argument(doc="interval list expand size",shortName="il_size",required=false) var ilSize : Int = 50 - // todo -- additional arguments or argument collection - - def script = { - if ( qtool.equals("VCFExtractSites") ) { - runVCFExtractSites - } - - if ( qtool.equals("VCFSimpleMerge") ) { - runVCFSimpleMerge - } - - if ( qtool.equals("VCFExtractSamples") ) { - runVCFExtractSamples - } - - if ( qtool.equals("VCFExtractIntervals") ) { - runVCFExtractIntervals - } - - if ( qtool.equals("SortByRef") ) { - runSortByRef - } - - if ( qtool.equals("ExpandTargets") ) { - runExpandTargets - } - } - - def runVCFExtractSites = { - var ves : VCFExtractSites = new VCFExtractSites(inVCF,output) - add(ves) - } - - def runVCFSimpleMerge = { - var vsm : VCFSimpleMerge = new VCFSimpleMerge - vsm.vcfs = inVCFs.split(",").toList.map(new File(_)) - vsm.outVCF = output - vsm.fai = new File(ref.getAbsolutePath+".fai") - - add(vsm) - } - - def runVCFExtractSamples = { - var ves : VCFExtractSamples = new VCFExtractSamples(inVCF,output,samples.split(",").toList) - add(ves) - } - - 
def runVCFExtractIntervals = { - var vei : VCFExtractIntervals = new VCFExtractIntervals(inVCF,output,keepFilters) - add(vei) - } - - def runSortByRef = { - var sbr : SortByRef = new SortByRef(inVCF,new File(ref.getAbsolutePath+".fai"),output) - add(sbr) - } - - def runExpandTargets = { - var ets : ExpandIntervals = new ExpandIntervals(intervalList,ilStart,ilSize,output,ref,"INTERVALS","INTERVALS") - add(ets) - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/carneiro/QuickCCTest.scala b/scala/qscript/oneoffs/carneiro/QuickCCTest.scala deleted file mode 100755 index 4565a6ce4..000000000 --- a/scala/qscript/oneoffs/carneiro/QuickCCTest.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Created by IntelliJ IDEA. - * User: carneiro - * Date: 3/29/11 - * Time: 5:31 PM - */ -package oneoffs.carneiro; - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ - -class QuickCCTest extends QScript { - qscript => - - @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) - var GATKjar: File = _ - - @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) - var input: File = _ - - @Input(doc="Reference fasta file", shortName="R", required=false) - var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - - @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) - var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - - @Input(shortName="L", required=false) - var intervals: List[String] = Nil - - - val queueLogDir: String = ".qlog/" - - - def script = { - - val recal = new File("recal.csv") - - val cc = new CountCovariates() - cc.reference_sequence = reference - cc.input_file :+= input - cc.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - cc.intervalsString = intervals - cc.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") 
- cc.scatterCount = 4 - cc.recal_file = recal - cc.memoryLimit = 4 - - add(cc); - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/carneiro/downsampling.scala b/scala/qscript/oneoffs/carneiro/downsampling.scala deleted file mode 100644 index 741b330f2..000000000 --- a/scala/qscript/oneoffs/carneiro/downsampling.scala +++ /dev/null @@ -1,169 +0,0 @@ -package oneoffs.carneiro - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ -import scala.io.Source._ - -/** - * Created by IntelliJ IDEA. - * User: carneiro - * Date: 3/17/11 - * Time: 11:29 AM - * To change this template use File | Settings | File Templates. - */ - - -class downsampling extends QScript { - - @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) - var GATKjar: File = _ - - @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) - var input: File = _ - - @Input(doc="target intervals", shortName="t", required=true) - var targetIntervals: File = _ - - @Input(doc="bootstrap number", shortName="b", required=false) - var bootstrap: Int = 1 - - @Input(doc="downsampling step", shortName="ds", required=true) - var downsamplingStep: Double = _ - - @Input(doc="downsampling floor", shortName="df", required=false) - var downsamplingFloor: Double = 0.0 - - @Input(doc="downsampling ceiling", shortName="dc", required=false) - var downsamplingCeiling: Double = 1.0 - - @Input(doc="Reference fasta file", shortName="R", required=false) - var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - - @Input(doc="HapMap file", shortName="H", required=false) - var hapmap: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf") - - @Input(doc="Omni file", shortName="O", required=false) - var omni: File = new 
File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf") - - @Input(doc="dbSNP file", shortName="D", required=false) - var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf") - - @Input(doc="project name", shortName="p", required=false) - var base: String = "prj" - - def countLines(file: File):Int = { - var count: Int = 0 - for (l <- fromFile(file).getLines) { - count = count + 1 - } - return count - } - - val queueLogDir: String = ".qlog/" - val outFile: String = "cov.out" - val fullCoverageVCF = new File("/humgen/gsa-hpprojects/dev/carneiro/downsampling/analysis/fullcov/fullcov.F1.filtered.vcf") - val trancheTarget = "99.0" - - def script = { - val nIntervals = math.min(200, countLines(targetIntervals)) - - var f: Double = downsamplingCeiling - var i: Int = 1 - while (f>=downsamplingFloor) { - var b: Int = bootstrap - while(b > 0) { - val file = swapExt(outFile, ".out", ".F" + i + "." 
+ b + ".out") - add(cov(f, file)) - b = b - 1 - } - val snp_out = new File(base + ".F" + i + ".raw.vcf") - val filter_out = new File(base + ".F" + i + ".filtered.vcf") - val eval_out = new File(base + ".F" + i + ".eval") - - add( snps(f, snp_out, nIntervals), - filter(snp_out, filter_out), - eval(filter_out, eval_out)) - - f = f - downsamplingStep - i = i + 1 - } - } - - trait CommandLineGATKArgs extends CommandLineGATK { - this.intervals :+= targetIntervals - this.jarFile = GATKjar - this.reference_sequence = reference - this.memoryLimit = 4 - } - - case class cov (fraction: Double, outFile: File) extends Percent20xCoverage with CommandLineGATKArgs { - this.input_file :+= input - this.out = outFile - this.ndrs = true - this.downsample_to_fraction = fraction - this.jobName = queueLogDir + outFile + ".cov" - } - - case class snps (fraction: Double, outFile: File, nIntervals: Int) extends UnifiedGenotyper with CommandLineGATKArgs { - this.memoryLimit = 6 - this.downsample_to_coverage = 600 - this.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP - this.input_file :+= input - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.downsample_to_fraction = fraction - this.scatterCount = nIntervals - this.out = outFile - this.analysisName = outFile + "_snps" - this.jobName = queueLogDir + outFile - } - - case class filter (inFile: File, outFile: File) extends VariantFiltration with CommandLineGATKArgs { - this.filterName ++= List("SNPSBFilter","SNPQDFilter","SNPHRunFilter") - this.filterExpression ++= List("\"SB>=0.10\"","\"QD<5.0\"","\"HRun>=4\"") - this.clusterWindowSize = 10 - this.clusterSize = 3 - this.variantVCF = inFile - this.out = outFile - this.analysisName = outFile + "_filter" - this.jobName = queueLogDir + outFile - } - - // 3.) 
Variant Quality Score Recalibration - Generate Recalibration table - case class VQSR(inFile: File, tranchesFiles: File, outFile: File) extends VariantRecalibrator with CommandLineGATKArgs { - this.rodBind :+= RodBind("input", "VCF", inFile) - this.rodBind :+= RodBind("hapmap", "VCF", hapmap, "known=false,training=true,truth=true,prior=15.0") - this.rodBind :+= RodBind("omni", "VCF", omni, "known=false,training=true,truth=true,prior=12.0") - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP, "known=true,training=false,truth=false,prior=10.0") - this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "HRun") - this.tranches_file = tranchesFile - this.recal_file = outFile - this.allPoly = true - this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0") - this.analysisName = t.name + "_VQSR" - this.jobName = queueLogDir + outFile - } - - // 4.) Apply the recalibration table to the appropriate tranches - case class applyVQSR (inFile: File, tranchesFiles: File, outFile: File) extends ApplyRecalibration with CommandLineGATKArgs { - this.rodBind :+= RodBind("input", "VCF", inFile) - this.tranches_file = tranchesFile - this.recal_file = inFile - this.ts_filter_level = trancheTarget - this.out = outFile - this.analysisName = outFile + "_AVQSR" - this.jobName = queueLogDir + outFile - } - - case class eval (inFile: File, outFile: File) extends VariantEval with CommandLineGATKArgs { - this.noST = true - this.noEV = true - this.evalModule ++= List("TiTvVariantEvaluator", "CountVariants", "ValidationReport") - this.stratificationModule ++= List("EvalRod", "CompRod", "Novelty") - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.rodBind :+= RodBind("eval", "VCF", inFile) - this.rodBind :+= RodBind("comp", "VCF", fullCoverageVCF) - this.out = outFile - this.analysisName = outFile + "_VariantEval" - this.jobName = queueLogDir + outFile - 
} -} diff --git a/scala/qscript/oneoffs/carneiro/justClean.scala b/scala/qscript/oneoffs/carneiro/justClean.scala deleted file mode 100755 index 1d0ba9b6d..000000000 --- a/scala/qscript/oneoffs/carneiro/justClean.scala +++ /dev/null @@ -1,70 +0,0 @@ -/** - * Created by IntelliJ IDEA. - * User: carneiro - * Date: 3/17/11 - * Time: 11:29 AM - * To change this template use File | Settings | File Templates. - */ - -import org.broadinstitute.sting.queue.extensions.gatk.{IndelRealigner, RealignerTargetCreator, RodBind} -import org.broadinstitute.sting.queue.QScript - - -class justClean extends QScript { - - @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) - var GATKjar: File = _ - - @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) - var input: File = _ - - @Input(doc="Reference fasta file", shortName="R", required=false) - var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - - @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) - var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - - @Input(doc="extra VCF files to use as reference indels for Indel Realignment", shortName="indels", required=false) - var indels: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf") - - - val queueLogDir: String = ".qlog/" - - - def script = { - - println(GATKjar) - - val outBam = swapExt(input, ".bam", ".clean.bam") - val tIntervals = swapExt(input, ".bam", ".all_indels.intervals") - - val target = new RealignerTargetCreator() - target.input_file :+= input - target.out = tIntervals - target.reference_sequence = reference - target.mismatchFraction = 0.0 - target.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - target.rodBind :+= RodBind("indels", "VCF", indels) - target.memoryLimit = 6 - target.jobName = queueLogDir + tIntervals + 
".atarget" - target.jarFile = GATKjar - target.scatterCount = 84 - - - - val clean = new IndelRealigner() - clean.input_file :+= input - clean.targetIntervals = tIntervals - clean.out = outBam - clean.reference_sequence = reference - clean.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - clean.rodBind :+= RodBind("indels", "VCF", indels) - clean.doNotUseSW = false - clean.jobName = queueLogDir + outBam + ".clean" - clean.jarFile = GATKjar - clean.memoryLimit = 8 - clean.scatterCount = 84 - - add(target, clean); - } -} diff --git a/scala/qscript/oneoffs/carneiro/justRecalibrate.scala b/scala/qscript/oneoffs/carneiro/justRecalibrate.scala deleted file mode 100755 index 2d83eb194..000000000 --- a/scala/qscript/oneoffs/carneiro/justRecalibrate.scala +++ /dev/null @@ -1,91 +0,0 @@ -package oneoffs.carneiro - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ -import net.sf.samtools.SAMFileReader - -/** - * Created by IntelliJ IDEA. - * User: carneiro - * Date: 4/20/11 - * Time: 16:29 PM - */ - - -class justRecalibrate extends QScript { - - @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) - var GATKjar: File = _ - - @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) - var input: File = _ - - @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false) - var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R") - - @Input(doc="Reference fasta file", shortName="R", required=false) - var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - - @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) - var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - - val queueLogDir: String = ".qlog/" - var nContigs: Int = 0 - - def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(new 
File(bamFile)) - return samReader.getFileHeader.getSequenceDictionary.getSequences.size() - } - - def script = { - - nContigs = getNumberOfContigs(input) - - val recalFile1: File = new File("recal1.csv") - val recalFile2: File = new File("recal2.csv") - val recalBam: File = swapExt(input, ".bam", "recal.bam") - val path1: String = "before" - val path2: String = "after" - - add(cov(input, recalFile1), - recal(input, recalFile1, recalBam), - cov(recalBam, recalFile2), - analyzeCovariates(recalFile1, path1), - analyzeCovariates(recalFile2, path2)) - } - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = GATKjar - this.reference_sequence = reference - this.memoryLimit = 4 - this.isIntermediate = true - } - - case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") - this.input_file :+= inBam - this.recal_file = outRecalFile - this.analysisName = queueLogDir + outRecalFile + ".covariates" - this.jobName = queueLogDir + outRecalFile + ".covariates" - this.scatterCount = nContigs - } - - case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { - this.input_file :+= inBam - this.recal_file = inRecalFile - this.out = outBam - this.isIntermediate = false - this.analysisName = queueLogDir + outBam + ".recalibration" - this.jobName = queueLogDir + outBam + ".recalibration" - this.scatterCount = nContigs - } - - case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { - this.resources = R - this.recal_file = inRecalFile - this.output_dir = outPath.toString - this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" - this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" - } -} \ No newline at end of file diff --git 
a/scala/qscript/oneoffs/carneiro/justRecalibrateEach.scala b/scala/qscript/oneoffs/carneiro/justRecalibrateEach.scala deleted file mode 100755 index 15c71adf6..000000000 --- a/scala/qscript/oneoffs/carneiro/justRecalibrateEach.scala +++ /dev/null @@ -1,116 +0,0 @@ -package oneoffs.carneiro - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ - -/** - * Created by IntelliJ IDEA. - * User: carneiro - * Date: 4/20/11 - * Time: 16:29 PM - */ - - -class justRecalibrateEach extends QScript { - - @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) - var GATKjar: File = _ - - @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) - var input: File = _ - - @Input(doc="path to AnalyzeCovariates.jar", fullName="path_to_ac_jar", shortName="ac", required=false) - var ACJar: File = new File("/humgen/gsa-scr1/carneiro/stable/dist/AnalyzeCovariates.jar") - - @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false) - var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R") - - @Input(doc="bad regions interval", shortName="bad", required=false) - var badInterval: File = new File("/humgen/gsa-hpprojects/dev/carneiro/goodbad/data/bad_regions.hg19.intervals") - - @Input(doc="Reference fasta file", shortName="R", required=false) - var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - - @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) - var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - - val queueLogDir: String = ".qlog/" - - - def script = { - - val first: Boolean = true - val bad: Boolean = true - - val badRecalFile1: File = new File("bad_recal1.csv"); - val badRecalFile2: File = new File("bad_recal2.csv"); - val badBam: File = new File("bad.bam"); - val badPath1: String = "bad1"; - val badPath2: 
String = "bad2"; - - val goodRecalFile1: File = new File("good_recal1.csv") - val goodRecalFile2: File = new File("good_recal2.csv") - val goodBam: File = new File("good.bam") - val goodPath1: String = "good1" - val goodPath2: String = "good2" - - add(cov(input, badRecalFile1, first, bad), - recal(input, badRecalFile1, badBam, bad), - cov(badBam, badRecalFile2, !first, bad), - analyzeCovariates(badRecalFile1, badPath1), - analyzeCovariates(badRecalFile2, badPath2)) - - add(cov(input, goodRecalFile1, first, !bad), - recal(input, goodRecalFile1, goodBam, !bad), - cov(goodBam, goodRecalFile2, !first, !bad), - analyzeCovariates(goodRecalFile1, goodPath1), - analyzeCovariates(goodRecalFile2, goodPath2)) - } - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = GATKjar - this.reference_sequence = reference - this.memoryLimit = 4 - this.isIntermediate = true - } - - case class cov (inBam: File, outRecalFile: File, FIRST: Boolean, BAD: Boolean) extends CountCovariates with CommandLineGATKArgs { - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") - this.input_file :+= inBam - this.recal_file = outRecalFile - this.useOriginalQualities = FIRST - this.analysisName = queueLogDir + outRecalFile + ".covariates" - this.jobName = queueLogDir + outRecalFile + ".covariates" - if (BAD) { - this.intervals :+= badInterval - this.scatterCount = 85 - } - else - this.excludeIntervals :+= badInterval - } - - case class recal (inBam: File, inRecalFile: File, outBam: File, BAD: Boolean) extends TableRecalibration with CommandLineGATKArgs { - this.input_file :+= inBam - this.recal_file = inRecalFile - this.out = outBam - this.isIntermediate = false - this.analysisName = queueLogDir + outBam + ".recalibration" - this.jobName = queueLogDir + outBam + ".recalibration" - if (BAD) { - this.intervals :+= badInterval - this.scatterCount = 85 - } - else - this.excludeIntervals 
:+= badInterval - } - - case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { - this.jarFile = ACJar - this.resources = R - this.recal_file = inRecalFile - this.output_dir = outPath.toString - this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" - this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/carneiro/mendelianViolation.scala b/scala/qscript/oneoffs/carneiro/mendelianViolation.scala deleted file mode 100755 index 3024046cf..000000000 --- a/scala/qscript/oneoffs/carneiro/mendelianViolation.scala +++ /dev/null @@ -1,56 +0,0 @@ -import java.io.File -import org.broadinstitute.sting.commandline.Argument -import org.broadinstitute.sting.queue.extensions.gatk.{SelectVariants, RodBind, VariantsToTable} -import org.broadinstitute.sting.queue.QScript - -/* -* Created by IntelliJ IDEA. -* User: carneiro -* Date: 4/12/11 -* Time: 11:24 AM -*/ - -class mendelianViolation extends QScript -{ - - @Argument(shortName="trio", doc="input trio VCF file", required=true) - var trio: File = _ - - @Argument(shortName="daughter", doc="daughter input VCF file", required=true) - var daughter: File = _ - - @Argument(shortName="family", doc="family string", required=false) - var family: String = "NA12891+NA12892=NA12878" - - @Argument(shortName="mvq", doc="mendelian violation quality", required=false) - var mvq: Double = 20 - - @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=false) - var GATKjar: File = new File("/humgen/gsa-scr1/carneiro/stable/dist/GenomeAnalysisTK.jar") - - def script = { - val reference = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - val trioViolations = "trio_violations.vcf" - val daughterViolations = "daughter_violations.vcf" - - val mv = new SelectVariants() - mv.rodBind :+= RodBind("variant", "VCF", trio) - mv.family = family - mv.reference_sequence = reference - 
mv.mvq = mvq - mv.out = trioViolations - mv.jarFile = GATKjar - mv.memoryLimit = 4 - - val intersection = new SelectVariants() - intersection.rodBind :+= RodBind("variant", "VCF", daughter) - intersection.rodBind :+= RodBind("conc","VCF", trioViolations) - intersection.reference_sequence = reference - intersection.conc = "conc" - intersection.out = daughterViolations - intersection.jarFile = GATKjar - intersection.memoryLimit = 4 - - add(mv, intersection) - } -} diff --git a/scala/qscript/oneoffs/carneiro/pbCalling.scala b/scala/qscript/oneoffs/carneiro/pbCalling.scala deleted file mode 100755 index 5a304ed63..000000000 --- a/scala/qscript/oneoffs/carneiro/pbCalling.scala +++ /dev/null @@ -1,250 +0,0 @@ -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript - -class pbCalling extends QScript { - qscript => - - @Argument(shortName="gatk", doc="gatk jar file", required=true) - var gatkJarFile: File = _ - - @Argument(shortName="outputDir", doc="output directory", required=true) - var outputDir: String = "./" - - - @Argument(shortName="dataset", doc="selects the datasets to run. 
If not provided, all datasets will be used", required=false) - var datasets: List[String] = Nil - - - class Target( - val baseName: String, - val reference: File, - val dbsnpFile: String, - val hapmapFile: String, - val maskFile: String, - val bamList: File, - val goldStandard_VCF: File, - val intervals: String, - val titvTarget: Double, - val isLowpass: Boolean, - val isCCS: Boolean) { - val name = qscript.outputDir + baseName - val clusterFile = new File(name + ".clusters") - val rawVCF = new File(name + ".raw.vcf") - val filteredVCF = new File(name + ".filtered.vcf") - val titvRecalibratedVCF = new File(name + ".titv.recalibrated.vcf") - val titvTranchesFile = new File(name + ".titv.tranches") - val recalibratedVCF = new File(name + ".ts.recalibrated.vcf") - val tranchesFile = new File(name + ".ts.tranches") - val cutVCF = new File(name + ".cut.vcf") - val evalFile = new File(name + ".eval") - val goldStandardName = qscript.outputDir + "goldStandard/" + baseName - val goldStandardClusterFile = new File(goldStandardName + ".clusters") - } - - val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") - val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta") - val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - val dbSNP_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_hg18.rod" - val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_b36.rod" - val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf" - val dbSNP_b37_129 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.rod" // Special case for NA12878 collections that can't use 132 because they are part of it. 
- val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.hg18_fwd.vcf" - val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b36_fwd.vcf" - val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf" - val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed" - val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed" - - // ToDos: - // reduce the scope of the datasets so the script is more nimble - // figure out how to give names to all the Queue-LSF logs (other than Q-1931@node1434-24.out) so that it is easier to find logs for certain steps - // create gold standard BAQ'd bam files, no reason to always do it on the fly - - // Analysis to add at the end of the script: - // auto generation of the cluster plots - // spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls - // produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF - - val lowPass: Boolean = true - val ccs: Boolean = true - - val targetDataSets: Map[String, Target] = Map( - "HiSeq" -> new Target("NA12878.HiSeq", hg18, dbSNP_hg18, hapmap_hg18, - "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask", - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), - new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.07, !lowPass, !ccs), - "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new 
File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), - new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass, !ccs), - "WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18, hapmap_hg18, - "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask", - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), - new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, !lowPass, !ccs), - "TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people - new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass, !ccs), - "LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36, - new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from - new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR - 
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, lowPass, !ccs), // chunked interval list to use with Queue's scatter/gather functionality - "LowPassAugust" -> new Target("ALL.august.v4", b37, dbSNP_b37, hapmap_b37, indelMask_b37, // BUGBUG: kill this, it is too large - new File("/humgen/1kg/processing/allPopulations_chr20_august_release.cleaned.merged.bams/ALL.cleaned.merged.list"), - new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass, !ccs), - "LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"), - new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass, !ccs), - "WExTrio" -> new Target("NA12878Trio.WEx", b37, dbSNP_b37_129, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bam"), - new File("/humgen/gsa-scr1/carneiro/prj/trio/snps/NA12878Trio.WEx.filtered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass, !ccs), - "pacbio" -> new Target("pacbio", b37, dbSNP_b37_129, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-scr1/carneiro/prj/pacbio/data/pacbio.recal.bam"), - new File("/humgen/gsa-scr1/carneiro/prj/pacbio/analisys/snps/pacbio.filtered.vcf"), - "/humgen/gsa-scr1/carneiro/prj/pacbio/data/pacbio.hg19.intervals", 1.8, !lowPass, !ccs), - "pb200" -> new Target("pb200", b37, dbSNP_b37_129, hapmap_b37, indelMask_b37, - new 
File("/humgen/gsa-scr1/carneiro/prj/pacbio/data/pb200.recal.bam"), - new File("/humgen/gsa-scr1/carneiro/prj/pacbio/analisys/snps/pb200.filtered.vcf"), - "/humgen/gsa-scr1/carneiro/prj/pacbio/data/pb200.hg19.intervals", 1.8, !lowPass, !ccs), - "pb2k" -> new Target("pb2k", b37, dbSNP_b37_129, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-scr1/carneiro/prj/pacbio/data/pb2k.recal.bam"), - new File("/humgen/gsa-scr1/carneiro/prj/pacbio/analisys/snps/pb2k.filtered.vcf"), - "/humgen/gsa-scr1/carneiro/prj/pacbio/data/pb2k.hg19.intervals", 1.8, !lowPass, !ccs), - "cc200" -> new Target("cc200", b37, dbSNP_b37_129, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-scr1/carneiro/prj/pacbio/data/cc200.recal.bam"), - new File("/humgen/gsa-scr1/carneiro/prj/pacbio/analisys/snps/cc200.filtered.vcf"), - "/humgen/gsa-scr1/carneiro/prj/pacbio/data/cc200.hg19.intervals", 1.8, !lowPass, ccs), - "cc2k" -> new Target("cc2k", b37, dbSNP_b37_129, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-scr1/carneiro/prj/pacbio/data/cc2k.recal.bam"), - new File("/humgen/gsa-scr1/carneiro/prj/pacbio/analisys/snps/cc2k.filtered.vcf"), - "/humgen/gsa-scr1/carneiro/prj/pacbio/data/cc2k.hg19.intervals", 1.8, !lowPass, ccs) - ) - - - def script = { - - // Selects the datasets in the -dataset argument and adds them to targets. - var targets: List[Target] = List() - if (!datasets.isEmpty) - for (ds <- datasets) - targets ::= targetDataSets(ds) // Could check if ds was mispelled, but this way an exception will be thrown, maybe it's better this way? - else // If -dataset is not specified, all datasets are used. 
- for (targetDS <- targetDataSets.valuesIterator) // for Scala 2.7 or older, use targetDataSets.values - targets ::= targetDS - - val goldStandard = true - for (target <- targets) { - add(new UnifiedGenotyper(target)) - add(new VariantFiltration(target)) - add(new VQSR(target, !goldStandard)) - add(new applyVQSR(target, !goldStandard)) - add(new VariantCut(target)) - add(new VariantEvaluation(target)) - } - } - - def bai(bam: File) = new File(bam + ".bai") - - val FiltersToIgnore = List("DPFilter", "ABFilter", "ESPStandard", "QualByDepth", "StrandBias", "HomopolymerRun") - - // 1.) Call SNPs with UG - class UnifiedGenotyper(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.UnifiedGenotyper { - this.jarFile = gatkJarFile - this.reference_sequence = t.reference - this.intervalsString ++= List(t.intervals) - this.scatterCount = 63 // the smallest interval list has 63 intervals, one for each Mb on chr20 - this.dcov = if ( t.isLowpass ) { 50 } else { 250 } - this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } - this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } - this.input_file :+= t.bamList - this.out = t.rawVCF - this.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY - this.analysisName = t.name + "_UG" - if (t.dbsnpFile.endsWith(".rod")) - this.DBSNP = new File(t.dbsnpFile) - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) - // Ridiculous workaround to get pacbio data to run.. never commit this! - this.deletions = 0.5 - this.mbq = 10 - } - - // 2.) 
Filter SNPs - class VariantFiltration(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.VariantFiltration { - this.jarFile = gatkJarFile - this.reference_sequence = t.reference - this.intervalsString ++= List(t.intervals) - this.scatterCount = 10 - this.variantVCF = t.rawVCF - this.out = t.filteredVCF - this.filterName ++= List("HARD_TO_VALIDATE") - this.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"") - this.analysisName = t.name + "_VF" - } - - class VQSR(t: Target, goldStandard: Boolean) extends VariantRecalibrator { - this.memoryLimit = 6 - this.intervalsString ++= List(t.intervals) - this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.filteredVCF } ) - this.rodBind :+= RodBind("hapmap", "VCF", t.hapmapFile) - if( t.hapmapFile.contains("b37") ) - this.rodBind :+= RodBind("1kg", "VCF", omni_b37) - else if( t.hapmapFile.contains("b36") ) - this.rodBind :+= RodBind("1kg", "VCF", omni_b36) - if (t.dbsnpFile.endsWith(".rod")) - this.DBSNP = new File(t.dbsnpFile) - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) - this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") - this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile } - this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile } - this.allPoly = true - this.tranche ++= List("0.1", "0.5", "0.7", "1.0", "1.1", "1.2", "1.5", "1.6", "1.7", "1.8", "1.9", "2.0", "2.1", "2.2", "2.5","3.0", "5.0", "10.0") - } - - class applyVQSR (t: Target, goldStandard: Boolean) extends ApplyRecalibration { - this.memoryLimit = 4 - this.intervalsString ++= List(t.intervals) - this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.filteredVCF } ) - this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile} - this.recal_file = if ( goldStandard ) { 
t.goldStandardRecalFile } else { t.recalFile } - this.fdr_filter_level = 2.0 - this.out = t.recalibratedVCF - } - - // 5.) Variant Cut filter out the variants marked by recalibration to the 99% tranche - class VariantCut(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.ApplyVariantCuts { - this.jarFile = gatkJarFile - this.reference_sequence = t.reference - this.rodBind :+= RodBind("input", "VCF", t.recalibratedVCF ) - this.analysisName = t.name + "_VC" - this.intervalsString ++= List(t.intervals) - this.out = t.cutVCF - this.tranchesFile = t.tranchesFile - this.fdr_filter_level = 1.0 - if (t.dbsnpFile.endsWith(".rod")) - this.DBSNP = new File(t.dbsnpFile) - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) - } - - // 6.) Variant Evaluation based on the sensitivity recalibrated vcf - class VariantEvaluation(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.VariantEval { - this.jarFile = gatkJarFile - val name: String = t.name - this.reference_sequence = t.reference - this.rodBind :+= RodBind("comp", "VCF", t.hapmapFile) - this.rodBind :+= RodBind("eval", "VCF", t.cutVCF) - this.analysisName = name + "_VE" - this.intervalsString ++= List(t.intervals) - this.EV ++= List("GenotypeConcordance") - this.out = t.evalFile - // Ridiculous workaround to get pacbio data to run.. never commit this! 
- this.sample ++= List("NA12878") - } -} diff --git a/scala/qscript/oneoffs/chartl/BatchMerge.q b/scala/qscript/oneoffs/chartl/BatchMerge.q deleted file mode 100755 index 9009b4656..000000000 --- a/scala/qscript/oneoffs/chartl/BatchMerge.q +++ /dev/null @@ -1,104 +0,0 @@ -import org.broadinstitute.sting.commandline.Hidden -import org.broadinstitute.sting.gatk.walkers.genotyper.{GenotypeLikelihoodsCalculationModel, UnifiedGenotyperEngine} -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFSimpleMerge, VCFExtractSites,VCFExtractIntervals} -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.baq.BAQ -import org.broadinstitute.sting.utils.text.XReadLines - -class batchMergePipeline extends QScript { - batchMerge => - - @Argument(doc="VCF list",shortName="vcfs") var vcfList: File = _ - @Argument(doc="bam list",shortName="bams") var bamList: File = _ - @Argument(doc="sting dir",shortName="sting") var stingDir: String = _ - @Argument(doc="reference file",shortName="ref") var ref: File = _ - @Argument(doc="batched output",shortName="batch") var batchOut: File = _ - //@Argument(doc="read UG settings from header",shortName="ugh") var ugSettingsFromHeader : Boolean = false - @Hidden @Argument(doc="Min base q",shortName="mbq",required=false) var mbq : Int = 20 - @Hidden @Argument(doc="Min map q",shortName="mmq",required=false) var mmq : Int = 20 - @Hidden @Argument(doc="baq gap open penalty, using sets baq to calc when necessary",shortName="baqp",required=false) var baq : Int = -1 - - def script = { - - var vcfs : List[File] = extractFileEntries(vcfList) - var bams : List[File] = extractFileEntries(bamList) - - trait ExtractArgs extends VCFExtractSites { - this.keepFilters = false - this.keepInfo = false - this.keepQual = false - } - - - - trait CombineVariantsArgs extends CombineVariants { - this.reference_sequence = 
batchMerge.ref - this.jarFile = new File(batchMerge.stingDir+"/dist/GenomeAnalysisTK.jar") - this.scatterCount = 10 - this.memoryLimit=4 - } - var combine : CombineVariants = new CombineVariants with CombineVariantsArgs - combine.out = swapExt(batchOut,".vcf",".variant.combined.vcf") - combine.rodBind ++= vcfs.map( u => new RodBind(u.getName,"vcf",u) ) - - add(combine) - - var getVariantAlleles : List[VCFExtractSites] = vcfs.map( u => new VCFExtractSites(u, swapExt(batchOut.getParent,u,".vcf",".alleles.vcf")) with ExtractArgs) - var combineVCFs : VCFSimpleMerge = new VCFSimpleMerge - combineVCFs.vcfs = getVariantAlleles.map(u => u.outVCF) - combineVCFs.fai = new File(ref.getAbsolutePath+".fai") - combineVCFs.outVCF = swapExt(batchOut,".vcf",".pf.alleles.vcf") - var extractIntervals : VCFExtractIntervals = new VCFExtractIntervals(combine.out,swapExt(combine.out,".vcf",".intervals.list"),true) - //addAll(getVariantAlleles) - //add(combineVCFs,extractIntervals) - add(extractIntervals) - - trait CalcLikelihoodArgs extends UGCalcLikelihoods { - this.reference_sequence = batchMerge.ref - this.min_base_quality_score = batchMerge.mbq - this.min_mapping_quality_score = batchMerge.mmq - if ( batchMerge.baq >= 0 ) { - this.baqGapOpenPenalty = batchMerge.baq - this.baq = BAQ.CalculationMode.CALCULATE_AS_NECESSARY - } - this.intervals :+= extractIntervals.listOut - this.allelesVCF = combine.out - this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar") - this.memoryLimit = 4 - this.scatterCount = 60 - this.output_mode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES - this.genotyping_mode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - } - - def newUGCL( bams: (List[File],Int) ) : UGCalcLikelihoods = { - var ugcl = new UGCalcLikelihoods with CalcLikelihoodArgs - ugcl.input_file ++= bams._1 - ugcl.out = new File("MBatch%d.likelihoods.vcf".format(bams._2)) - return ugcl - } - - var calcs: List[UGCalcLikelihoods] = 
bams.grouped(20).toList.zipWithIndex.map(u => newUGCL(u)) - addAll(calcs) - - trait CallVariantsArgs extends UGCallVariants { - this.reference_sequence = batchMerge.ref - this.intervals :+= extractIntervals.listOut - this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar") - this.scatterCount = 30 - this.memoryLimit = 8 - this.output_mode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES - this.genotyping_mode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - } - - var cVars : UGCallVariants = new UGCallVariants with CallVariantsArgs - cVars.rodBind ++= calcs.map( a => new RodBind("variant"+a.out.getName.replace(".vcf",""),"vcf",a.out) ) - cVars.out = batchOut - add(cVars) - } - - override def extractFileEntries(in: File): List[File] = { - return (new XReadLines(in)).readLines.toList.map( new File(_) ) - } -} diff --git a/scala/qscript/oneoffs/chartl/BootstrapCalls.q b/scala/qscript/oneoffs/chartl/BootstrapCalls.q deleted file mode 100755 index 915e2e3c7..000000000 --- a/scala/qscript/oneoffs/chartl/BootstrapCalls.q +++ /dev/null @@ -1,185 +0,0 @@ -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.{GenotypeMergeType, VariantMergeType} -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript - -class BootstrapCalls extends QScript { - @Argument(doc="Bam file list",shortName="I",required=true) - var bamList: File = _ - @Argument(doc="Intervals file",shortName="L",required=true) - var intervalFile: File = _ - @Argument(doc="Output file",shortName="o",required=true) - var bootstrapMergedOut: File = _ - @Argument(doc="Reference file",shortName="R",required=true) - var reference: File = _ - @Argument(doc="Downsampling Level",shortName="D",required=false) - var downsamplingLevel: Int = 4 - @Argument(doc="Num Bootstrap Callsets",shortName="B",required=false) - var numberOfBootstraps: Int = 25 - @Argument(doc="call confidence",shortName="conf",required=false) - 
var standCallConf: Double = 4.0 - @Argument(doc="dbsnp file (vcf version)",shortName="dbsnp",required=false) - var dbsnp: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf") - @Argument(doc="sting jar",shortName="s",required=true) - var sting: File = _ - - /********************** - * URGENT NOTE: - * for this to do any good you need to take out the random seeds in - * ReservoirDownsampler: 20 - * MathUtils: 649 - * - * You will also need to hack the recalibrator to always trust AC (which are no longer integer-valued) - * and to deal with double-valued AC fields - */ - - def script = { - val bams: List[File] = extractFileEntries(bamList) - trait UGArgs extends UnifiedGenotyper { - this.input_file = bams - this.reference_sequence = reference - this.dcov = downsamplingLevel - this.intervals :+= intervalFile - this.stand_call_conf = standCallConf - this.stand_emit_conf = standCallConf - this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp) - this.scatterCount = 20 - this.jarFile = sting - this.memoryLimit = 4 - } - - val bootstrapBase = swapExt(bootstrapMergedOut,".vcf",".boot%d.vcf").getAbsolutePath - var calls : List[UnifiedGenotyper] = Nil - for ( i <- 0 until (numberOfBootstraps+1) ) { - var ug : UnifiedGenotyper = new UnifiedGenotyper with UGArgs - ug.out = new File(bootstrapBase.format(i)) - ug.analysisName = "Boostrap%d".format(i) - calls :+= ug - } - - addAll(calls) - - trait MergeArgs extends BootstrapCallsMerger { - this.reference_sequence = reference - this.intervals :+= intervalFile - this.scatterCount = 40 - this.jarFile = sting - this.memoryLimit = 4 - this.rodBind ++= calls.map(u => u.out).zipWithIndex.map(u => new RodBind("bootstrap_%d".format(u._2),"vcf",u._1)) - this.out = bootstrapMergedOut - } - - var merge : BootstrapCallsMerger = new BootstrapCallsMerger with MergeArgs - add(merge) - - trait ClusterArgs extends GenerateVariantClusters { - this.reference_sequence = reference - this.intervals :+= 
intervalFile - this.rodBind :+= new RodBind("input","vcf",merge.out) - this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) - this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) - this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) - this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) - this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster") - this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") - this.qual = 100 - this.std = 3.5 - this.mG = 8 - this.trustAllPolymorphic = true - this.memoryLimit = 8 - this.jarFile = sting - } - - var clust : GenerateVariantClusters = new GenerateVariantClusters with ClusterArgs - add(clust) - - trait VQSRArgs extends VariantRecalibrator { - this.reference_sequence = reference - this.intervals :+= intervalFile - this.out = swapExt(bootstrapMergedOut,"vcf","recal.vcf") - this.rodBind :+= new RodBind("input","vcf",merge.out) - this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) - this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) - this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) - this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) - this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster") - this.sm = 
org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY - this.tranche ++= List("0.1", "0.5", "0.7", "1.0", "3.0", "5.0", "10.0", "100.0") - this.trustAllPolymorphic = true - this.tranchesFile = swapExt(bootstrapMergedOut,"vcf","tranche") - this.memoryLimit=8 - this.jarFile = sting - this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp) - } - - var recal : VariantRecalibrator = new VariantRecalibrator with VQSRArgs - add(recal) - - trait CutArgs extends ApplyVariantCuts { - this.reference_sequence = reference - this.intervals :+= intervalFile - this.rodBind :+= new RodBind("input","vcf",recal.out) - this.tranchesFile = recal.tranchesFile - this.fdr_filter_level = 1.0 - this.out = swapExt(bootstrapMergedOut,".vcf",".recal.cut.vcf") - this.jarFile = sting - this.memoryLimit = 4 - this.scatterCount = 5 - } - - var cut : ApplyVariantCuts = new ApplyVariantCuts with CutArgs - add(cut) - - class RmHeader extends CommandLineFunction { - @Input(doc="vcf") var vcf : File = _ - @Output(doc="headerless vcf") var noheadvcf : File = _ - - def commandLine : String = { - "head -n 1 %s > %s ; grep -v \\#\\# %s >> %s".format(vcf.getAbsolutePath,noheadvcf.getAbsolutePath, - vcf.getAbsolutePath,noheadvcf.getAbsolutePath) - } - } - - var rm : RmHeader = new RmHeader - rm.vcf = cut.out - rm.noheadvcf = swapExt(cut.out,".vcf",".nohead.vcf") - add(rm) - - trait CombineArgs extends CombineVariants { - this.reference_sequence = reference - this.intervals :+= intervalFile - this.rodBind :+= new RodBind("loCov","vcf",rm.noheadvcf) - this.rodBind :+= new RodBind("hiCov","vcf",new File("/humgen/gsa-pipeline/PVQF4/all_batches_v001/batch_001/SnpCalls/ESPGO_Gabriel_NHLBI_EOMI_setone_EOMI_Project.cleaned.annotated.handfiltered.vcf")) - this.variantMergeOptions = VariantMergeType.UNION - this.genotypeMergeOptions = GenotypeMergeType.PRIORITIZE - this.priority = "hiCov,loCov" - this.out = 
swapExt(bootstrapMergedOut,".vcf",".merged.combined.vcf") - this.jarFile = sting - this.memoryLimit = 6 - } - - var combine : CombineVariants = new CombineVariants with CombineArgs - add(combine) - - trait EvalArgs extends VariantEval { - this.reference_sequence = reference - this.intervals :+= intervalFile - this.rodBind :+= new RodBind("evalCombined","vcf",combine.out) - //this.rodBind :+= new RodBind("evalCut","vcf",rm.noheadvcf) - //this.rodBind :+= new RodBind("evalFCP","vcf",new File("/humgen/gsa-pipeline/PVQF4/all_batches_v001/batch_001/SnpCalls/ESPGO_Gabriel_NHLBI_EOMI_setone_EOMI_Project.cleaned.annotated.handfiltered.vcf")) - this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp) - this.jarFile = sting - this.ST = List("Filter","Novelty","JexlExpression") - this.select_names = List("lowOnly","filteredInLow","Intersection","filteredInHi","hiOnly","filteredInAll") - this.select_exps = List("\"set == 'loCov'\"","\"set == 'hiCov-filterInloCov'\"", - "\"set == 'Intersection'\"", "\"set == 'filterInhiCov-loCov'\"", - "\"set == 'hiCov'\"","\"set == 'FilteredInAll'\"") - this.EV = List("TiTvVariantEvaluator","CountVariants","CompOverlap") - this.out = swapExt(bootstrapMergedOut,".vcf",".merged.combined.eval") - this.nt = 8 - this.memoryLimit = 12 - } - - var eval : VariantEval = new VariantEval with EvalArgs - add(eval) - } -} diff --git a/scala/qscript/oneoffs/chartl/ExomeVQSR.q b/scala/qscript/oneoffs/chartl/ExomeVQSR.q deleted file mode 100755 index 21d342a99..000000000 --- a/scala/qscript/oneoffs/chartl/ExomeVQSR.q +++ /dev/null @@ -1,219 +0,0 @@ -import org.broadinstitute.sting.commandline.ArgumentSource -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.function.ListWriterFunction -import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction} -import 
org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals -import org.broadinstitute.sting.queue.QScript -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.text.XReadLines - -class FullCallingPipeline extends QScript { - qscript => - - @Input(doc="path to GATK jar", shortName="G") - var gatkJar: File = _ - - @Input(doc="level of parallelism for UnifiedGenotyper (both for SNPs and indels). By default is set to 20.", shortName="varScatter", required=false) - var num_var_scatter_jobs = 20 - - @Argument(doc="expand each target in input intervals by the specified number of bases (50 bases by default)", shortName="expand", required=false) - var expandIntervals = 50 - - private var pipeline: Pipeline = _ - - private final val picardFixMatesClass = "net.sf.picard.sam.FixMateInformation" - - - val BAM_FILES : List[File] = (new XReadLines(new File("/humgen/gsa-hphome1/chartl/projects/oneoffs/VQSR_Exome/resources/broad.bam.list"))).readLines.map(u => new File(u)).toList - val DBSNP : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf") - val REF : File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - val INTS : File = new File("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list") - val BASE : String = "exon_vqsr" - - val handFiltered : File = new File("/humgen/1kg/exomes/results/broad.wex.96samples/v1/1KGBroadWEx.variants.vcf") - - - trait CommandLineGATKArgs extends CommandLineGATK { - this.intervals :+= INTS - this.jarFile = qscript.gatkJar - this.reference_sequence = REF - this.memoryLimit = Some(4) - } - // ------------ SETUP THE PIPELINE ----------- // - - - def script = { - - endToEnd(BASE,"cleaned") - } - - def endToEnd(base: String, bamType: String) = { - val bamFiles = BAM_FILES - - val ei : ExpandIntervals = new ExpandIntervals(INTS,1,qscript.expandIntervals, 
new File("Resources", base + ".flanks.interval_list"), REF, "INTERVALS", "INTERVALS") - ei.jobOutputFile = new File(".queue/logs/Overall/ExpandIntervals.out") - - if (qscript.expandIntervals > 0) { - //add(ei) - } - - trait ExpandedIntervals extends CommandLineGATK { - if (qscript.expandIntervals > 0) { - this.intervals :+= ei.outList - } - } - - // Call indels - val indels = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals - indels.analysisName = base + "_indels" - indels.jobOutputFile = new File(".queue/logs/IndelCalling/UnifiedGenotyper.indels.out") - indels.memoryLimit = Some(6) - indels.downsample_to_coverage = Some(600) - indels.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL - indels.input_file = bamFiles - indels.rodBind :+= RodBind("dbsnp", "vcf", DBSNP) - indels.out = new File("IndelCalls", base+".indels.vcf") - - indels.scatterCount = qscript.num_var_scatter_jobs - indels.setupScatterFunction = { - case scatter: ScatterFunction => - scatter.commandDirectory = new File("IndelCalls/ScatterGather") - scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out") - } - indels.setupCloneFunction = { - case (clone: CloneFunction, index: Int) => - clone.commandDirectory = new File("IndelCalls/ScatterGather/Scatter_%s".format(index)) - clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index)) - } - indels.setupGatherFunction = { - case (gather: GatherFunction, source: ArgumentSource) => - gather.commandDirectory = new File("IndelCalls/ScatterGather/Gather_%s".format(source.field.getName)) - gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName)) - } - - // Filter indels - val filteredIndels = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals - filteredIndels.analysisName = base + "_filteredIndels" - 
filteredIndels.jobOutputFile = new File(".queue/logs/IndelCalling/VariantFiltration.indels.out") - filteredIndels.filterName ++= List("IndelQUALFilter","IndelSBFilter","IndelQDFilter") - filteredIndels.filterExpression ++= List("\"QUAL<30.0\"","\"SB>-1.0\"","\"QD<2\"") - filteredIndels.variantVCF = indels.out - filteredIndels.out = swapExt("IndelCalls", indels.out, ".vcf",".filtered.vcf") - - // Call snps - val snps = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals - snps.analysisName = base+"_snps" - snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.snps.out") - snps.memoryLimit = Some(6) - snps.downsample_to_coverage = Some(600) - snps.input_file = bamFiles - snps.rodBind :+= RodBind("dbsnp", "vcf", DBSNP) - snps.out = new File("SnpCalls", base+".snps.vcf") - - snps.scatterCount = qscript.num_var_scatter_jobs - snps.setupScatterFunction = { - case scatter: ScatterFunction => - scatter.commandDirectory = new File("SnpCalls/ScatterGather") - scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out") - } - snps.setupCloneFunction = { - case (clone: CloneFunction, index: Int) => - clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index)) - clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index)) - } - snps.setupGatherFunction = { - case (gather: GatherFunction, source: ArgumentSource) => - gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName)) - gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName)) - } - - // Filter snps at indels - val filteredSNPs = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals - filteredSNPs.analysisName = base+"_filteredSNPs" - filteredSNPs.jobOutputFile = new File(".queue/logs/SNPCalling/VariantFiltration.snps.out") - filteredSNPs.clusterWindowSize = Some(10) - 
filteredSNPs.clusterSize = Some(3) - filteredSNPs.rodBind :+= RodBind("mask", "VCF", filteredIndels.out) - filteredSNPs.variantVCF = snps.out - filteredSNPs.out = swapExt("SnpCalls",snps.out,".vcf",".filtered.vcf") - - // Mako de Clusters - val cr = new ContrastiveRecalibrator with CommandLineGATKArgs with ExpandedIntervals - cr.rodBind :+= new RodBind("input","vcf",filteredSNPs.out) - cr.rodBind :+= new RodBind("dbsnp","vcf",DBSNP,"known=true,training=false,truth=false,prior=8.0") - cr.rodBind :+= new RodBind("hapmap","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"),"known=false,training=true,truth=true,prior=15.0") - cr.rodBind :+= new RodBind("omni","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"),"known=false,training=true,truth=true,prior=12.0") - cr.allPoly = true - cr.use_annotation ++= List("HaplotypeScore","SB","QD","HRun") - cr.tranches_file = new File(base+".tranche") - cr.recal_file = new File(base+".contrastive.recal.table") - cr.tranche ++= List("99.9","99.5","99.25","98.0","97.75","97.65","97.5","97.3","97.2","97.1","98.0","97.5","97.0","96.75","96.5","96.0","95.5","95.0","94.75","94.5","94.25","94.0", - "93.75","93.5","93.25","93.0","92.75","92.5","92.25","92.0","91.0","90.0") - cr.analysisName = base+"_ContrastiveRecalibrator" - cr.memoryLimit = Some(32) - cr.num_threads = Some(6) - - - // Apply the Recalibration - val ar = new ApplyRecalibration with CommandLineGATKArgs with ExpandedIntervals - ar.rodBind :+= new RodBind("input","vcf",filteredSNPs.out) - ar.tranches_file = cr.tranches_file - ar.recal_file = cr.recal_file - ar.ts_filter_level = Some(91.75) - ar.out = new File(base+"_contrastive_recal.91.75.vcf") - ar.memoryLimit = Some(6) - - // Variant eval the standard region - val stdEval = new VariantEval with CommandLineGATKArgs - stdEval.analysisName = base+"_VariantEval" - stdEval.jobOutputFile = new 
File(".queue/logs/Overall/VariantEval.std.out") - stdEval.noST = true - stdEval.noEV = true - stdEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance") - stdEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample") - stdEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP) - stdEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out) - stdEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered) - stdEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered) - stdEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf")) - stdEval.rodBind :+= RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf")) - stdEval.out = swapExt(ar.out, ".vcf", ".eval") - stdEval.num_threads = Some(6) - - // Variant eval the flanking region - val flanksEval = new VariantEval with CommandLineGATKArgs - flanksEval.analysisName = base+"_VariantEval" - flanksEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.flanks.out") - flanksEval.intervals = List(ei.outList) - flanksEval.noST = true - flanksEval.noEV = true - flanksEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance") - flanksEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample") - flanksEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP) - flanksEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out) - flanksEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered) - flanksEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered) - flanksEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf")) - flanksEval.rodBind :+= RodBind("compOMNI","vcf",new 
File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf")) - flanksEval.out = swapExt(ar.out, ".vcf", ".flanks.eval") - flanksEval.num_threads = Some(6) - - // Make the bam list - val listOfBams = new File("Resources", base +".BamFiles.list") - - val writeBamList = new ListWriterFunction - writeBamList.analysisName = base + "_BamList" - writeBamList.jobOutputFile = new File(".queue/logs/Overall/WriteBamList.out") - writeBamList.inputFiles = bamFiles - writeBamList.listFile = listOfBams - - //add(indels, filteredIndels, snps, filteredSNPs, stdEval, writeBamList,cr,ar) - add(ar,stdEval) - - if (qscript.expandIntervals > 0) { - add(flanksEval) - } - - } -} diff --git a/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q b/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q deleted file mode 100755 index 17360659b..000000000 --- a/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q +++ /dev/null @@ -1,248 +0,0 @@ -import collection.mutable.HashMap -import java.io.{PrintWriter, PrintStream} -import org.broadinstitute.sting.commandline.ArgumentSource -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.function.QFunction -import org.broadinstitute.sting.queue.function.scattergather.{CloneFunction, ScatterFunction, GatherFunction} -import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.utils.interval.IntervalSetRule -import org.broadinstitute.sting.utils.text.XReadLines -import collection.JavaConversions._ - -class Exome_VQSR_FullSearch extends QScript { - qScript => - - // COMMAND LINE ARGUMENTS - - // VARIABLES USED - val SCRIPT_BASE_NAME = "Exome_VQSR_Search" - val UG_CALL_THRESH = 10.0; - val VQSR_CALL_THRESH = List(10.0,20.0,30.0,40.0,50.0) - val VQSR_ANNOTATIONS_TO_USE = 
List(List("QD","SB"),List("QD","SB","HRun"),List("QD","SB","HaplotypeScore"),List("QD","SB","HRun","HaplotypeScore")) - val HM3_SITES = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf") - val OMNI_CHIP = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf") - val AXIOM_CHIP = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf") - val DBSNP_129 = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf") - val SENSITIVITY = (new Range(0,19,1)).map(u => 90.0 + 0.5*u).toList - val RECALIBRATE_TOGETHER = List(true,false) - val VQSR_HAPMAP_PRIOR = "15.0" - val VQSR_OMNI_PRIOR = "12.0" - - var VQSR_RODBINDS : HashMap[String,List[RodBind]] = new HashMap[String,List[RodBind]] - val VQSR_TAG_FT = "known=false,training=true,truth=%s,prior=%s" - val VQSR_DBSNP_TAG = "known=true,training=false,truth=false,prior=0.1" - - - for ( tf <- List( (true,false),(false,true),(true,true)) ) { - var mrb: List[RodBind] = Nil - val ext = (if ( tf._1 ) "HT" else "HF") + (if ( tf._2 ) "OT" else "OF") - val hmSt = if ( tf._1 ) "true" else "false" - val omSt = if ( tf._2 ) "true" else "false" - mrb :+= RodBind("dbsnp","VCF",DBSNP_129,VQSR_DBSNP_TAG) - mrb :+= RodBind("HapMap3","VCF",HM3_SITES,VQSR_TAG_FT.format(hmSt,VQSR_HAPMAP_PRIOR)) - mrb :+= RodBind("Omni","VCF",OMNI_CHIP,VQSR_TAG_FT.format(omSt,VQSR_OMNI_PRIOR)) - VQSR_RODBINDS += new Tuple2(ext,mrb) - } - - val BAM_FILES : List[File] = asScalaIterator((new XReadLines(new File("/humgen/gsa-hphome1/chartl/projects/oneoffs/VQSR_Exome/resources/broad.bam.list")))).map(u => new File(u)).toList - val REF : File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - val INTS : File = new 
File("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list") - val EXPAND_INTS = 40 - val HAND_FILTERED : File = new File("/humgen/1kg/exomes/results/broad.wex.96samples/v1/1KGBroadWEx.variants.vcf") - val GATK_JAR : File = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") - - def script() { - trait CommandLineGATKArgs extends CommandLineGATK { - this.intervals :+= INTS - this.jarFile = GATK_JAR - this.reference_sequence = REF - this.memoryLimit = Some(4) - } - - val ei : ExpandIntervals = new ExpandIntervals(INTS,1,EXPAND_INTS, new File("Resources", SCRIPT_BASE_NAME + ".flanks.interval_list"), REF, "INTERVALS", "INTERVALS") - ei.jobOutputFile = new File(".queue/logs/Overall/ExpandIntervals.out") - - if (EXPAND_INTS > 0) { - //add(ei) - } - - trait ExpandedIntervals extends CommandLineGATK { - if (EXPAND_INTS > 0) { - this.intervals :+= ei.outList.getAbsoluteFile - } - } - - val callSNPsAndIndels = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals - callSNPsAndIndels.analysisName = SCRIPT_BASE_NAME+"_calls" - callSNPsAndIndels.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.snps.out") - callSNPsAndIndels.memoryLimit = Some(6) - callSNPsAndIndels.downsample_to_coverage = Some(600) - callSNPsAndIndels.input_file = BAM_FILES - callSNPsAndIndels.rodBind :+= RodBind("dbsnp", "vcf", DBSNP_129) - callSNPsAndIndels.out = new File(SCRIPT_BASE_NAME+".rawCalls.vcf") - callSNPsAndIndels.stand_call_conf = Some(UG_CALL_THRESH) - - callSNPsAndIndels.scatterCount = 50 - callSNPsAndIndels.setupScatterFunction = { - case scatter: ScatterFunction => - scatter.commandDirectory = new File("SnpCalls/ScatterGather") - scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out") - } - callSNPsAndIndels.setupCloneFunction = { - case (clone: CloneFunction, index: Int) => - clone.commandDirectory = 
new File("SnpCalls/ScatterGather/Scatter_%s".format(index)) - clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index)) - } - callSNPsAndIndels.setupGatherFunction = { - case (gather: GatherFunction, source: ArgumentSource) => - gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName)) - gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName)) - } - - //add(callSNPsAndIndels) - - class ExtractSNPs(in: File, out: File) extends InProcessFunction { - - @Input(doc="foo") - var inputVCF : File = in - @Output(doc="foo") - var outputVCF : File = out - - def canPrint(line: String) : Boolean = { - //System.out.println(line) - if ( line.startsWith("#") ) { return true } - val spline = line.split("\t",6) - //System.out.println(spline.apply(3)+" "+spline.apply(4)) - if ( spline.apply(3).size > 1 || spline.apply(4).size > 1 ) { return false } - true - } - - def run() { - val outWriter = new PrintWriter(new PrintStream(outputVCF)) - asScalaIterator(new XReadLines(inputVCF)).foreach(u => { - if ( canPrint(u) ) { - outWriter.println(u) - } - }) - - outWriter.close - } - } - - val extractSNPs : ExtractSNPs = new ExtractSNPs(callSNPsAndIndels.out,new File(SCRIPT_BASE_NAME+".snpCalls.vcf")) - //add(extractSNPs) - - def getPath(annoList: List[String], jointRecal: Boolean) : File = { - new File("VQSR/%s/%s".format( annoList.reduceLeft( _ + "." 
+ _ ) , if(jointRecal) "together" else "separate") ) - } - - var filterMap : HashMap[Double,File] = new HashMap[Double,File] - - for ( thresh <- VQSR_CALL_THRESH ) { - var filterQual = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals - filterQual.rodBind :+= new RodBind("variant","VCF",extractSNPs.outputVCF.getAbsoluteFile) - filterQual.filterExpression :+= "\'QUAL < %.1f\'".format(thresh) - filterQual.filterName :+= "LowQual" - filterQual.out = new File(SCRIPT_BASE_NAME+".filterQual%.1f.vcf".format(thresh)) - add(filterQual) - filterMap += new Tuple2(thresh,filterQual.out.getAbsoluteFile) - } - - for ( annotations <- VQSR_ANNOTATIONS_TO_USE ) { - for ( recalTogether <- RECALIBRATE_TOGETHER ) { - val directory = getPath(annotations,recalTogether) - for ( call_thresh <- VQSR_CALL_THRESH ) { - for ( vqsr_rb <- VQSR_RODBINDS.iterator ) { - trait VQSR_Args extends VariantRecalibrator { - this.allPoly = true - this.analysisName = "VQSR_%s_%s_%.1f".format( annotations.reduceLeft( _ + "." + _), if ( recalTogether ) "true" else "false", call_thresh) - this.commandDirectory = directory - this.use_annotation ++= annotations - this.tranche ++= SENSITIVITY.map(u => "%.1f".format(u)) - this.rodBind :+= RodBind("inputData","VCF",filterMap.get(call_thresh).get) - this.rodBind ++= vqsr_rb._2 - this.memoryLimit = Some(8) - } - val nameFormat = SCRIPT_BASE_NAME+".%1f.%s.".format(call_thresh,vqsr_rb._1)+"%s." 
- if ( recalTogether ) { - var vqsr = new VariantRecalibrator with VQSR_Args with ExpandedIntervals with CommandLineGATKArgs - vqsr.tranchesFile = new File(nameFormat.format("both")+"tranche") - vqsr.recalFile = new File(nameFormat.format("both")+"recal") - add(vqsr) - addAll(eval(vqsr, ei.outList, "flanks")) - addAll(eval(vqsr, INTS, "exons")) - } else { - var exons = new VariantRecalibrator with VQSR_Args with CommandLineGATKArgs - exons.tranchesFile = new File(nameFormat.format("exons")+"tranche") - exons.recalFile = new File(nameFormat.format("exons")+"recal") - var flanks = new VariantRecalibrator with VQSR_Args - flanks.intervals :+= ei.outList.getAbsoluteFile - flanks.jarFile = GATK_JAR - flanks.memoryLimit = Some(8) - flanks.reference_sequence = REF - flanks.tranchesFile = new File(nameFormat.format("flanks")+"tranche") - flanks.recalFile = new File(nameFormat.format("flanks")+"recal") - add(exons,flanks) - addAll(eval(exons)) - addAll(eval(flanks)) - } - } - } - } - } - } - - // want to apply and eval - def eval(recal: VariantRecalibrator) : List[QFunction] = { eval(recal,null,"") } - def eval(recal: VariantRecalibrator, list: File, ext: String) : List[QFunction] = { - var functions : List[QFunction] = Nil - trait ImplicitArgs extends CommandLineGATK { - this.jarFile = recal.jarFile - this.reference_sequence = recal.reference_sequence - this.commandDirectory = recal.commandDirectory - if ( list == null ) { - this.intervals ++= recal.intervals - } else { - this.intervals :+= list - } - } - - trait ApplyArgs extends ApplyRecalibration with ImplicitArgs { - this.tranchesFile = recal.tranchesFile - this.recalFile = recal.recalFile - for ( r <- recal.rodBind ) { - if ( r.trackName.startsWith("input") ) { - this.rodBind :+= r - } - } - this.memoryLimit = Some(4) - } - - trait EvalArgs extends VariantEval with ImplicitArgs { - this.stratificationModule = List("Novelty") - this.evalModule = List("TiTvVariantEvaluator","CountVariants","GenotypeConcordance") - 
this.rodBind :+= RodBind("dbsnp","VCF",DBSNP_129) - this.rodBind :+= RodBind("compAxiom","VCF",AXIOM_CHIP) - this.memoryLimit = Some(4) - } - - val extender = if ( ext != null ) ".cut%.1f."+ext else ".cut%.1f" - for ( sens <- SENSITIVITY ) { - var cut = new ApplyRecalibration with ApplyArgs - cut.analysisName = recal.analysisName+extender.format(sens) - val vcfExt = extender.format(sens)+".vcf" - cut.out = swapExt(cut.recalFile,".recal",vcfExt) - cut.ts_filter_level = Some(sens) - functions :+= cut - - var eval = new VariantEval with EvalArgs - eval.analysisName = cut.analysisName+".eval" - eval.out = swapExt(cut.out,".vcf",".eval") - eval.rodBind :+= RodBind("evalContrastive","VCF",cut.out) - functions :+= eval - } - - functions - } -} diff --git a/scala/qscript/oneoffs/chartl/Phase1WholeGenome.scala b/scala/qscript/oneoffs/chartl/Phase1WholeGenome.scala deleted file mode 100755 index 372c2f9bd..000000000 --- a/scala/qscript/oneoffs/chartl/Phase1WholeGenome.scala +++ /dev/null @@ -1,148 +0,0 @@ -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils -import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType - -class Phase1WholeGenome extends QScript { - qscript => - - @Input(doc="path to GATK jar", shortName="gatk", required=true) - var gatkJar: File = _ - - @Input(doc="the chromosome to process", shortName="chr", required=true) - var chr: Int = _ - - @Input(doc="output path", shortName="outputDir", required=true) - var outputDir: String = _ - - @Input(doc="path to tmp space for storing 
intermediate bam files", shortName="outputTmpDir", required=true) - var outputTmpDir: String = _ - - private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf" - val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566) - val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","IBS","JPT","LWK","MXL","PUR","TSI","YRI") //,"PPN") - - private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference - this.memoryLimit = Some(3) - this.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - } - - class AnalysisPanel(val baseName: String, val pops: List[String], val jobNumber: Int) { - val rawVCFsnps = new File(qscript.outputDir + "/calls/chr" + qscript.chr.toString + "/" + baseName + "/" + baseName + ".phase1.chr" + qscript.chr.toString + "." + jobNumber + ".raw.snps.vcf") - val rawVCFindels = new File(qscript.outputDir + "/calls/chr" + qscript.chr.toString + "/" + baseName + "/" + baseName + ".phase1.chr" + qscript.chr.toString + "." 
+ jobNumber + ".raw.indels.vcf") - - val callSnps = new UnifiedGenotyper with CommandLineGATKArgs - callSnps.out = rawVCFsnps - callSnps.dcov = Some( 50 ) - callSnps.stand_call_conf = Some( 4.0 ) - callSnps.stand_emit_conf = Some( 4.0 ) - callSnps.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY) - callSnps.jobName = qscript.outputTmpDir + "/calls/chr" + qscript.chr.toString + "/" +baseName + ".phase1.chr" + qscript.chr.toString + "." + jobNumber + ".raw.snps" - callSnps.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL) - - val callIndels = new UnifiedGenotyper with CommandLineGATKArgs - callIndels.out = rawVCFindels - callIndels.dcov = Some( 50 ) - callIndels.stand_call_conf = Some( 10.0 ) - callIndels.stand_emit_conf = Some( 10.0 ) - callIndels.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF) - callIndels.glm = Some(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL) - callIndels.minIndelCnt = Some(5) - callIndels.read_filter :+= "Platform454" - callIndels.jobName = qscript.outputTmpDir + "/calls/chr" + qscript.chr.toString + "/" +baseName + ".phase1.chr" + qscript.chr.toString + "." 
+ jobNumber + ".raw.indels" - callIndels.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL) - callIndels.abort_at_too_much_coverage = Some(4500) - } - - def script = { - val basesPerJob: Int = 700000 - val lastBase: Int = qscript.chromosomeLength(qscript.chr - 1) - var start: Int = 1 - var stop: Int = start - 1 + basesPerJob - if( stop > lastBase ) { stop = lastBase } - var jobNumber: Int = 1 - while( jobNumber < (lastBase.toFloat / basesPerJob.toFloat) + 1.0) { - callThisChunk("%d:%d-%d".format(qscript.chr, start, stop), jobNumber) - start += basesPerJob - stop += basesPerJob - if( stop > lastBase ) { stop = lastBase } - jobNumber += 1 - } - } - - - def callThisChunk(interval: String, jobNumber: Int) = { - - val AFR = new AnalysisPanel("AFR", List("LWK","YRI","ASW","CLM","PUR"), jobNumber) - val AMR = new AnalysisPanel("AMR", List("MXL","CLM","PUR","ASW"), jobNumber) - val EUR = new AnalysisPanel("EUR", List("CEU","FIN","GBR","TSI","IBS","MXL","CLM","PUR","ASW"), jobNumber) - val ASN = new AnalysisPanel("ASN", List("CHB","CHS","JPT","MXL","CLM","PUR"), jobNumber) - //val PAA = new AnalysisPanel("PAA", List("PPN","YRI","CHB"), jobNumber) - val analysisPanels = List(AFR, ASN, AMR, EUR) - - for( population <- qscript.populations ) { - val baseTmpName: String = qscript.outputTmpDir + "/calls/chr" + qscript.chr.toString + "/" + population + ".phase1.chr" + qscript.chr.toString + "." + jobNumber.toString + "." - val bamList: File = new File("/humgen/1kg/processing/allPopulations_wholeGenome_phase1_release/bam_lists/%s.list".format(population)) - val targetIntervals: File = new File(baseTmpName + "target.intervals") - - // 1.) 
Create cleaning targets - val target = new RealignerTargetCreator with CommandLineGATKArgs - target.memoryLimit = Some(3) - target.input_file :+= bamList - target.intervalsString :+= interval - target.out = targetIntervals - target.mismatchFraction = Some(0.0) - target.maxIntervalSize = Some(700) - target.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls) - target.jobName = baseTmpName + "target" - target.isIntermediate = true - - // 2.) Clean without SW - val clean = new IndelRealigner with CommandLineGATKArgs - val cleanedBam = new File(baseTmpName + "cleaned.bam") - clean.memoryLimit = Some(4) - clean.input_file :+= bamList - clean.intervalsString :+= interval - clean.targetIntervals = targetIntervals - clean.out = cleanedBam - clean.doNotUseSW = true - clean.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE) - clean.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls) - clean.jobName = baseTmpName + "clean" - clean.isIntermediate = true - clean.compress = Some(0) - clean.index_output_bam_on_the_fly = Some(true) - clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true - - add(target, clean) - - for( a <- analysisPanels ) { - for( p <- a.pops) { - if( p == population ) { - a.callSnps.input_file :+= cleanedBam - a.callIndels.input_file :+= cleanedBam - } - } - } - } - - for( a <- analysisPanels ) { - a.callSnps.intervalsString :+= interval - a.callIndels.intervalsString :+= interval - add(a.callSnps, a.callIndels) - } - - } -} diff --git a/scala/qscript/oneoffs/chartl/RFAPipeline.q b/scala/qscript/oneoffs/chartl/RFAPipeline.q deleted file mode 100755 index de24571d5..000000000 --- a/scala/qscript/oneoffs/chartl/RFAPipeline.q +++ /dev/null @@ -1,67 +0,0 @@ -import org.broadinstitute.sting.commandline.ArgumentCollection -import org.broadinstitute.sting.gatk.CommandLineGATK -import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection -import 
org.broadinstitute.sting.queue.extensions.gatk.{RodBind, RFCombine, RFExtractor} -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.util.IOUtils - -class RFAPipeline extends QScript { - rfapipeline => - - @Argument(doc="bam list",shortName="bams") var bamList: File = _ - @Argument(doc="sting dir",shortName="sting") var stingDir: String = _ - @Argument(doc="reference file",shortName="ref") var ref: File = _ - @Argument(doc="output base",shortName="o") var baseOut: String = _ - @Argument(doc="interval list",shortName="L") var intervals: File = _ - @ArgumentCollection var rfaArgs : RFAArgumentCollection = new RFAArgumentCollection() - @Argument(doc="Number of bams per text file output",shortName="br",required = false) var br = 20 - - def script = { - // step one, break the bam files up - var subBams : Iterator[List[File]] = extractFileEntries(bamList).toList.grouped(br) - - trait ExtractorArgs extends RFExtractor { - this.reference_sequence = rfapipeline.ref - this.jarFile = new File(rfapipeline.stingDir+"/dist/GenomeAnalysisTK.jar") - this.intervals :+= rfapipeline.intervals - // copy the args into the extractor - this.windowJump = Some(rfaArgs.windowJump) - this.windowSize = Some(rfaArgs.windowSize) - this.fixedZ = Some(rfaArgs.fixedZ) - this.perSampleZ = Some(rfaArgs.sampleZThresh) - this.HighInsertSize= Some(rfaArgs.highInsertSize) - this.LowInsertSize = Some(rfaArgs.lowInsertSize) - this.clippedBases = Some(rfaArgs.clippedBases) - this.sampleEpsilon = Some(rfaArgs.EPSILON) - this.memoryLimit = Some(2) - } - - val extract : List[RFExtractor] = subBams.zipWithIndex.map( u => { - var g = new RFExtractor with ExtractorArgs - g.input_file ++= u._1 - g.out = new File("%s.%d.txt".format(rfapipeline.baseOut,u._2)) - g - }).toList - - addAll(extract) - - trait CombineArgs extends RFCombine { - this.reference_sequence = rfapipeline.ref - this.jarFile = new File(rfapipeline.stingDir+"/dist/GenomeAnalysisTK.jar") - } - - var combine : 
RFCombine = new RFCombine with CombineArgs - var idx : Int = 0 - extract.foreach( ex => { - val name = "s%d".format(idx) - val exRB = new RodBind(name,"table",ex.out) - combine.rodBind :+= exRB - combine.memoryLimit = Some(6) - idx+=1; - }) - - combine.out = new File(baseOut) - - add(combine) - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/chartl/RefineGenotypesAndMerge.q b/scala/qscript/oneoffs/chartl/RefineGenotypesAndMerge.q deleted file mode 100755 index df8e70c64..000000000 --- a/scala/qscript/oneoffs/chartl/RefineGenotypesAndMerge.q +++ /dev/null @@ -1,116 +0,0 @@ -import java.io.{FileReader, File, BufferedReader} -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils - -class RefineGenotypesAndMerge extends QScript { - qscript => - - @Argument(doc="VCF file to run beagle genotype refinement on",required=true,shortName="vcf") var vcfsToBeagle: List[File] = _ - @Argument(doc="Output file to which to write final vcf",required=true,shortName="out") var outVCF: File = _ - @Argument(doc="Path to GATK jar",required=true,shortName="gatk") var gatkJar: File = _ - @Argument(doc="Path to BEAGLE jar",required=true,shortName="beagle") var beagleJar: File = _ - @Argument(doc="Reference file",required=true,shortName="ref") var reference: File = _ - - trait GATKArgs extends CommandLineGATK { - this.reference_sequence = qscript.reference - this.jarFile = qscript.gatkJar - } - - class GunzipFile(in: File, out:File ) extends CommandLineFunction { - @Input(doc="file to gunzip") var inp = in - @Output(doc="file to gunzip to") var outp = out - - def commandLine = "gunzip -c %s > 
%s".format(inp.getAbsolutePath, outp.getAbsolutePath) - } - - class BeagleRefinement extends CommandLineFunction { - @Input(doc="The beagle input file") var beagleInput: File = _ - var beagleOutputBase: String = _ - var beagleMemoryGigs: Int = 4 - - /** - * Note: These get set - */ - @Output(doc="The beagle phased file") var beaglePhasedFile: File = _ - @Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _ - @Output(doc="The beagle r2 file") var beagleRSquared: File = _ - var beagleOutputDir: String = _ - - def freezeOutputs = { - if ( beagleInput.getParent == null ) { - beagleOutputDir = "" - } else { - beagleOutputDir = beagleInput.getParent - } - beaglePhasedFile = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".phased.gz") - beagleLikelihoods = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".gprobs.gz") - beagleRSquared = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".r2") - } - - def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,beagleOutputBase) - } - - def RefineGenotypes(inputVCF: File, outputVCF: File, beagleBase: String ) : List[CommandLineFunction] = { - var commands: List[CommandLineFunction] = Nil - - var beagleInput = new ProduceBeagleInput with GATKArgs - beagleInput.variantVCF = inputVCF - beagleInput.out = swapExt(inputVCF,".vcf",".beagle") - - var refine = new BeagleRefinement - refine.beagleInput = beagleInput.out - refine.beagleOutputBase = beagleBase - refine.beagleMemoryGigs = 6 - refine.memoryLimit = 6 - refine.freezeOutputs - - var unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl")) - var unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl")) - - var vcfConvert = new BeagleOutputToVCF with GATKArgs - vcfConvert.variantVCF = inputVCF - vcfConvert.rodBind :+= new 
RodBind("beagleR2","BEAGLE",refine.beagleRSquared) - vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.outp) - vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.outp) - vcfConvert.out = outputVCF - - commands :+= beagleInput - commands :+= refine - commands :+= unzipPhased - commands :+= unzipProbs - commands :+= vcfConvert - - return commands - } - - def mergeVCFs(vcfs: List[File], outputVCF: File) : CombineVariants = { - var cv = new CombineVariants with GATKArgs - cv.out = outputVCF - cv.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNSORTED - cv.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION - cv.priority = (vcfs.foldLeft[List[String]](Nil)( (bNames,vcf) => bNames ::: List[String](swapExt(vcf,".vcf","").getName))).mkString(",") - cv.rodBind = vcfs.foldLeft[List[RodBind]](Nil)( (rods,vcf) => rods ::: List[RodBind](new RodBind(swapExt(vcf,".vcf","").getName,"VCF",vcf))) - - return cv - } - - def script = { - var vcfsToMerge: List[File] = Nil - for ( i <- 0 until vcfsToBeagle.size ) { - val base = vcfsToBeagle.get(i).getName+".bout" - val refine_out = swapExt(vcfsToBeagle.get(i),".vcf",".refined.vcf") - vcfsToMerge :+= refine_out - for ( c <- RefineGenotypes(vcfsToBeagle.get(i),refine_out,base) ) { - add(c) - } - } - - add(mergeVCFs(vcfsToMerge,outVCF)) - } -} diff --git a/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q b/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q deleted file mode 100755 index 7c896fd4a..000000000 --- a/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q +++ /dev/null @@ -1,123 +0,0 @@ -import org.broadinstitute.sting.commandline.{Argument, Output, Input} -import org.broadinstitute.sting.queue.extensions.gatk.{IntervalScatterFunction, CommandLineGATK} -import org.broadinstitute.sting.queue.QScript -import 
org.broadinstitute.sting.utils.text.XReadLines -import collection.JavaConversions._ - -class ScatterGatherAssociation extends QScript { - - @Argument(fullName="gatkJar",shortName="gatk",doc="Path to the GATK jarfile",required=true) - var gatkJar : File = _ - @Argument(fullName="metaData",shortName="SM",doc="Sample meta data",required=true) - var metaData : File = _ - @Argument(fullName="bamList",shortName="I",doc="list of bam files (single .list file)",required=true) - var bamList : File = _ - @Argument(fullName="outputBase",shortName="o",doc="Base for output files",required=true) - var outBase : String = _ - @Argument(fullName="noBedGraph",shortName="nbg",doc="Don't use bedgraph format",required=false) - var dontUseBedGraph : Boolean = false - @Argument(fullName="reference",shortName="R",doc="Reference file, if not hg19",required=false) - var referenceFile : File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - @Argument(fullName="intervals",shortName="L",doc="Interval list, if not whole-exome 1.1",required=false) - var intervalsFile : File = new File("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list") - @Argument(fullName="memoryLimit",shortName="M",doc="Memory limit for SG jobs",required=false) - var memLimit : Int = 4 - @Argument(fullName="scatterJobs",shortName="SJ",doc="Number of scatter jobs",required=false) - var scatterJobs : Int = 125 - - val ASSOCIATION_TESTS = List("BaseQualityScore","InsertSizeDistribution","MappingQuality0", - "MateMappingQuality","MateOtherContig","MateSameStrand","MateUnmapped","MismatchRate", - "ProperPairs","ReadClipping","ReadIndels","ReadMappingQuality","ReferenceMismatches", - "SampleDepth") - - class RegionalAssociationSG(base : String, ext : String) extends CommandLineGATK with ScatterGatherableFunction{ - this.analysis_type = "RegionalAssociation" - - 
@Argument(doc="useBed") - var useBed : Boolean = true - - // the rest are output files implicitly constructed by the multiplexer - - @Output(doc="bqs") - @Gather(classOf[SimpleTextGatherFunction]) - var bqs : File = new File(String.format("%s.%s.%s", base, "BaseQualityScore", ext)) - /* - @Output(doc="isd") - @Gather(classOf[SimpleTextGatherFunction]) - var isd : File = new File(String.format("%s.%s.%s",base,"InsertSizeDistribution",ext)) - @Output(doc="mq0") - @Gather(classOf[SimpleTextGatherFunction]) - var mq0 : File = new File(String.format("%s.%s.%s",base,"MappingQuality0",ext)) - @Output(doc="mmq") - @Gather(classOf[SimpleTextGatherFunction]) - var mmq : File = new File(String.format("%s.%s.%s",base,"MateMappingQuality",ext)) - @Output(doc="moc") - @Gather(classOf[SimpleTextGatherFunction]) - var moc : File = new File(String.format("%s.%s.%s",base,"MateOtherContig",ext)) - @Output(doc="mss") - @Gather(classOf[SimpleTextGatherFunction]) - var mss : File = new File(String.format("%s.%s.%s",base,"MateSameStrand",ext)) - /@Output(doc="mu") - @Gather(classOf[SimpleTextGatherFunction]) - var mu : File = new File(String.format("%s.%s.%s",base,"MateUnmapped",ext)) - @Output(doc="mmr") - @Gather(classOf[SimpleTextGatherFunction]) - var mmr : File = new File(String.format("%s.%s.%s",base,"MismatchRate",ext)) - @Output(doc="pp") - @Gather(classOf[SimpleTextGatherFunction]) - var pp : File = new File(String.format("%s.%s.%s",base,"ProperPairs",ext)) - @Output(doc="rc") - @Gather(classOf[SimpleTextGatherFunction]) - var rc : File = new File(String.format("%s.%s.%s",base,"ReadClipping",ext)) - @Output(doc="ri") - @Gather(classOf[SimpleTextGatherFunction]) - var ri : File = new File(String.format("%s.%s.%s",base,"ReadIndels",ext)) - @Output(doc="rmq") - @Gather(classOf[SimpleTextGatherFunction]) - var rmq : File = new File(String.format("%s.%s.%s",base,"ReadMappingQuality",ext)) - @Output(doc="rm") - @Gather(classOf[SimpleTextGatherFunction]) - var rm : File = new 
File(String.format("%s.%s.%s",base,"ReferenceMismatches",ext)) - @Output(doc="sd") - @Gather(classOf[SimpleTextGatherFunction]) - var sd : File = new File(String.format("%s.%s.%s",base,"SampleDepth",ext)) - @Output(doc="rai") - @Gather(classOf[SimpleTextGatherFunction]) - var rli : File = new File(String.format("%s.%s.%s",base,"ReadsAberrantInsertSize",ext)) - @Output(doc="rwi") - @Gather(classOf[SimpleTextGatherFunction]) - var rwi : File = new File(String.format("%s.%s.%s",base,"ReadsWithIndels",ext)) - */ - - override def commandLine = { - var bedStr : String = "" - if ( useBed ) { - bedStr = " -bg " - } - super.commandLine + " -AT ALL -o %s%s".format(base,bedStr) - } - } - - def script = { - - var ext : String = "" - if ( dontUseBedGraph ) { - ext = "tdf" - } else { - ext = "bedgraph" - } - - var association = new RegionalAssociationSG(outBase,ext) - association.useBed = ! dontUseBedGraph - association.sample_metadata :+= metaData - association.intervals :+= intervalsFile - association.reference_sequence = referenceFile - association.jarFile = gatkJar - association.input_file ++= asScalaIterable((new XReadLines(bamList)).readLines).map(u => new File(u)).toList - association.scatterCount = scatterJobs - association.memoryLimit = Some(memLimit) - association.scatterClass = classOf[IntervalScatterFunction] - - add(association) - } -} diff --git a/scala/qscript/oneoffs/chartl/expanded_targets.q b/scala/qscript/oneoffs/chartl/expanded_targets.q deleted file mode 100755 index 291fac6cb..000000000 --- a/scala/qscript/oneoffs/chartl/expanded_targets.q +++ /dev/null @@ -1,119 +0,0 @@ -import org.broadinstitute.sting.commandline.ArgumentCollection -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals -import org.broadinstitute.sting.queue.pipeline.PipelineArgumentCollection -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.utils.text.XReadLines -import 
collection.JavaConversions._ - -class expanded_targets extends QScript { - @ArgumentCollection var args : PipelineArgumentCollection = new PipelineArgumentCollection - @Argument(shortName="bait",doc="The list of baits associated with the target list",required=false) var baitFile : File = _ - @Argument(shortName="thisTrigger",doc="The trigger track to use",required=false) var thisTrigger : File = new File("/humgen/gsa-hphome1/chartl/projects/exome/expanded/triggers/joined.omni.hiseq.vcf") - - def script = { - // note : bam sorting and indexing handled outside of this script - // GATK hacked by modifying GenomeLocSortedSet not to fuss about getting multiple instances of the same interval - - val intervalExpands : List[ExpandIntervals] = (new Range(0,40,1)).toList.map( u => { - new ExpandIntervals(args.projectIntervals,1+5*u,5,new File("./"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"TSV","INTERVALS") - }) - - trait GATKArgs extends CommandLineGATK { - this.reference_sequence = args.projectRef - this.DBSNP = args.projectDBSNP - this.jarFile = args.gatkJar - } - - val userDir = "." 
- - addAll(intervalExpands) - - val cleanIntervals : ExpandIntervals = new ExpandIntervals(args.projectIntervals,1,210,new File(userDir+"/"+args.projectName+"_expanded_full.interval_list"),args.projectRef,"TSV","INTERVALS") - - add(cleanIntervals) - - val uncleanBams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u)) - val realign : List[RealignerTargetCreator] = uncleanBams.map(u => { - var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs - rtc.out = swapExt(userDir,u,".bam",".clean.targets.interval_list") - rtc.input_file :+= u.getAbsoluteFile - rtc.intervals :+= cleanIntervals.outList - rtc.memoryLimit = 6 - rtc - }) - val clean : List[IndelRealigner] = realign.map( u => { - var cleaner : IndelRealigner = new IndelRealigner with GATKArgs - cleaner.targetIntervals = u.out - cleaner.input_file = u.input_file - cleaner.memoryLimit = 6 - cleaner.out = new File(userDir+"/"+swapExt(u.out,".bam",".expanded.targets.bam").getName) - cleaner.intervals :+= cleanIntervals.outList - cleaner - }) - - addAll(realign) - addAll(clean) - - val callFiles: List[File] = intervalExpands.map(u => makeCalls(u.outList,clean.map(h => swapExt(h.out,".bam",".sorted.bam")))) - - } - - def makeCalls(iList: File, bams: List[File]): File = { - - trait GATKArgs extends CommandLineGATK { - this.reference_sequence = args.projectRef - this.DBSNP = args.projectDBSNP - this.jarFile = args.gatkJar - this.intervals :+= iList - } - - var call : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs - call.input_file = bams - call.out = swapExt(iList,".interval_list",".raw.vcf") - call.trig_emit_conf = 0.0 - call.rodBind :+= new RodBind("trigger","vcf",thisTrigger) - call.scatterCount = 10 - call.memoryLimit = 6 - var filter : VariantFiltration = new VariantFiltration with GATKArgs - filter.rodBind :+= new RodBind("variant","vcf",call.out) - filter.filterExpression :+= "\"QD<5.0\"" - filter.filterName :+= "LowQualByDepth" - 
filter.filterExpression :+= "\"SB>-0.10\"" - filter.filterName :+= "HighStrandBias" - filter.out = swapExt(iList,".interval_list",".filtered.vcf") - var callHiseq : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs - callHiseq.reference_sequence = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - callHiseq.input_file = List(new File("/seq/picard_aggregation/EXT1/NA12878/v3/NA12878.bam")) - callHiseq.rodBind :+= new RodBind("trigger","vcf",filter.out) - callHiseq.out = swapExt(iList,".interval_list",".hiSeq.genotypes.vcf") - callHiseq.trig_emit_conf = 0.0 - callHiseq.scatterCount = 5 - - add(call,filter,callHiseq) - - var eval : VariantEval = new VariantEval with GATKArgs - eval.rodBind :+= new RodBind("evalInterval","vcf",filter.out) - eval.rodBind :+= new RodBind("compHiSeq","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf")) - eval.rodBind :+= new RodBind("compHiSeq_atSites","vcf",callHiseq.out) - eval.rodBind :+= new RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/764samples.deduped.b37.annot.vcf")) - eval.out = swapExt(iList,".interval_list",".eval") - //eval.reportType = org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV - eval.memoryLimit = 4 - - add(eval) - eval.out - - } - - class B37_to_HG19 extends CommandLineFunction { - @Input(doc="vcf") var vcf : File = _ - @Output(doc="out") var outVCF : File = _ - - def commandLine = "python /humgen/gsa-hphome1/chartl/sting/python/vcf_b36_to_hg18.py %s %s".format(vcf.getAbsolutePath,outVCF.getAbsolutePath) - } - - class HG19_to_B37 extends B37_to_HG19 { - override def commandLine = "python /humgen/gsa-hphome1/chartl/sting/python/vcf_b36_to_hg18.py -r %s %s".format(vcf.getAbsolutePath,outVCF.getAbsolutePath) - } -} diff --git a/scala/qscript/oneoffs/chartl/fullCallingPipelineV2.q 
b/scala/qscript/oneoffs/chartl/fullCallingPipelineV2.q deleted file mode 100755 index 48c92418b..000000000 --- a/scala/qscript/oneoffs/chartl/fullCallingPipelineV2.q +++ /dev/null @@ -1,64 +0,0 @@ -import org.broadinstitute.sting.commandline.ArgumentCollection -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.queue.extensions.gatk.CommandLineGATK -import org.broadinstitute.sting.queue.pipeline._ -import org.broadinstitute.sting.queue.util.PipelineUtils -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils - -class fullCallingPipelineV2 extends QScript { - fcp => - - @ArgumentCollection var pipelineArgs = new PipelineArgumentCollection - - private var pipeline: Pipeline = _ - - def script = { - pipelineArgs.verifyArguments - pipeline = PipelineUtils.loadPipelineFromPAC(pipelineArgs) - var callingLib: VariantCalling = new VariantCalling(pipeline,fcp.pipelineArgs.gatkJar) - var cleaningLib: BamProcessing = new BamProcessing(pipeline,fcp.pipelineArgs.gatkJar,fcp.pipelineArgs.picardFixMatesJar) - - val projectBase: String = fcp.pipeline.getProject.getName - val cleanedBase: String = projectBase + ".cleaned" - val uncleanedBase: String = projectBase + ".uncleaned" - - // there are commands that use all the bam files - val recalibratedSamples = fcp.pipeline.getSamples.filter( u => ( u.getBamFiles.contains("recalibrated") || u.getBamFiles.contains("cleaned") ) ) - - var bamsToClean: List[(File,File)] = Nil - var recalBams: List[File] = Nil - var cleanedBams: List[File] = Nil - - for ( sample <- recalibratedSamples ) { - val bam = sample.getBamFiles.get("recalibrated") - recalBams :+= bam - if (!sample.getBamFiles.contains("cleaned")) { - sample.getBamFiles.put("cleaned", swapExt(bam,"bam","cleaned.bam")) - bamsToClean :+= (bam,sample.getBamFiles.get("cleaned")) - } - - cleanedBams :+= sample.getBamFiles.get("cleaned") - } - - if ( 
!fcp.pipelineArgs.skip_cleaning ) { - addAll(cleaningLib.StandardIndelRealign(bamsToClean,fcp.pipelineArgs.cleaningJobs)) - } - - if (!fcp.pipelineArgs.skip_cleaning ) { - endToEnd(cleanedBase, cleanedBams, callingLib) - } else { - endToEnd(uncleanedBase, recalBams, callingLib) - } - } - - - def endToEnd(base: String, bamFiles: List[File], lib: VariantCalling) = { - var recal_vcf = new File(base+"_snps.recal.annotated.tranched.vcf") - var handfilt_vcf = new File(base+"_snps.handfiltered.annotated.vcf") - var indel_vcf = new File(base+"_indel_calls.vcf") - - addAll(lib.StandardCallingPipeline(bamFiles,indel_vcf,recal_vcf,handfilt_vcf,fcp.pipelineArgs.target_titv,fcp.pipelineArgs.refseqTable)) - } -} diff --git a/scala/qscript/oneoffs/chartl/old/private_mutations_old.q b/scala/qscript/oneoffs/chartl/old/private_mutations_old.q deleted file mode 100755 index fc38efe60..000000000 --- a/scala/qscript/oneoffs/chartl/old/private_mutations_old.q +++ /dev/null @@ -1,408 +0,0 @@ -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils -import org.broadinstitute.sting.queue.extensions.gatk.{UnifiedGenotyper, RodBind, CombineVariants, SelectVariants} -import org.broadinstitute.sting.queue.QScript -import tools.nsc.io.File - -class private_mutations_old extends QScript { - pm_script => // alias for the script arguments - - val eomi_merged_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_merged.vcf") - val g1k_exome_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/g1k_exomes.vcf") - val ceu_pilot_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/CEU.low_coverage.genotypes.vcf") - - val SINGLE_SAMPLE_EXPRESSION = "\"PASS.*AC=1;|PASS.*AC=2;.*1/1|PASS.*AC=2.*1\\|1\"" - val NOVEL_VARIANT_EXPRESSION = "-v \"rs[0-9]\"" - val KNOWN_VARIANT_EXPRESSION = "\"rs[0-9].*PASS\"" - - val BASE_DIR = 
"/humgen/gsa-hphome1/chartl/projects/private_mutations/queue/" - val LIST_DIR = BASE_DIR + "lists/" - val VCF_DIR = BASE_DIR + "vcfs/" - val EVAL_DIR = BASE_DIR + "evals/" - val SCRATCH_DIR = BASE_DIR + "intermediate/" - - /* - * Steps of analysis. - * - * todo -- 1) Calculate number of EOMI calls top-down, cumulative include/exclude that are: - * - present in one sample - * - not in dbsnp 129 - * - not in 1000G pilot CEU - * - not in 1000G production EUR - * - not in 1000G production exome EUR - * - * todo -- 2) Distributional aspects of remaining mutations - * - Depth of Coverage - * - Functional Class - * - * todo -- 3) 1000G Exome v Lowpass -- Venn of single-sample variants - * - Venn numbers - * - Distribution of depth for exome variants missed in low-pass - * - * todo -- 4) Estimate relationship between depth and sensitivity to private variation - * - * todo -- 5) Calculate % of exome empowered to detect private variation - * - * todo -- 6) Identify loci across 1000G Ex and EOMI that are low-powered - * - venn of low-power targets - * - mapping the intersection - * - genes affected by low power - */ - - - /* - * Scripting commands go here - */ - def script = { - // setup analysis resources (best done in loop) - val g1k_lowpass_chr : List[File] = (1 to 22).toList.map( i => new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/chr%d/ALL.august.chr%d.recal.vrcut.EUR.vcf".format(i,i))) - val eomi_annot_set : List[File] = (1 to 7).toList.map( i => new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_set%d.vcf".format(i))) - val g1k_lowpass_hg19_merged : File = new File(VCF_DIR+"g1k_lowpass_merged.hg19.sites.vcf") - val eomi_merged_hg19 : File = eomi_merged_calls - val ceu_hg19 : File = new File(VCF_DIR+"G1K_Pilot.CEU.hg19.vcf") - - combineAndMergeSites(g1k_lowpass_chr,g1k_lowpass_hg19_merged) - liftoverVCF(ceu_pilot_calls,ceu_hg19) - - - - // get EOMI ss-only list - var eomi_single_sample : Vcf2List = new 
Vcf2List - eomi_single_sample.in_vcf = eomi_merged_hg19 - eomi_single_sample.filter = SINGLE_SAMPLE_EXPRESSION - eomi_single_sample.out_list = new File(LIST_DIR+"EOMI_single_sample_variants.sites.list") - - add(eomi_single_sample) - - var eomi_dbsnp : Vcf2List = new Vcf2List - eomi_dbsnp.in_vcf = eomi_merged_hg19 - eomi_dbsnp.filter = KNOWN_VARIANT_EXPRESSION - eomi_dbsnp.out_list = new File(LIST_DIR+"EOMI_novel_variants.sites.list") - - add(eomi_dbsnp) - - var ceu_sites : Vcf2List = new Vcf2List - ceu_sites.in_vcf = ceu_hg19 - ceu_sites.out_list = new File(SCRATCH_DIR+"G1KP_CEU_variants.sites.list") - - add(ceu_sites) - - var g1k_lowpass_sites : Vcf2List = new Vcf2List - g1k_lowpass_sites.in_vcf = g1k_lowpass_hg19_merged - g1k_lowpass_sites.out_list = new File(SCRATCH_DIR+"G1K_lowpass_EUR.sites.list") - - add(g1k_lowpass_sites) - - var g1k_exome_sites : Vcf2List = new Vcf2List - g1k_exome_sites.in_vcf = g1k_exome_calls - g1k_exome_sites.out_list = new File(SCRATCH_DIR+"G1K_exomes.sites.list") - - add(g1k_exome_sites) - - var remove_dbsnp : RemoveIntersect = new RemoveIntersect - remove_dbsnp.fileToUnique = eomi_single_sample.out_list - remove_dbsnp.comparisonFile = eomi_dbsnp.out_list - remove_dbsnp.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.list") - - add(remove_dbsnp) - - var remove_ceu : RemoveIntersect = new RemoveIntersect - remove_ceu.fileToUnique = remove_dbsnp.outputFile - remove_ceu.comparisonFile = ceu_sites.out_list - remove_ceu.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.list") - - add(remove_ceu) - - var remove_lowpass : RemoveIntersect = new RemoveIntersect - remove_lowpass.fileToUnique = remove_ceu.outputFile - remove_lowpass.comparisonFile = g1k_lowpass_sites.out_list - remove_lowpass.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.list") - - add(remove_lowpass) - - var remove_exome : RemoveIntersect = new RemoveIntersect - remove_exome.fileToUnique = remove_lowpass.outputFile - 
remove_exome.comparisonFile = g1k_exome_sites.out_list - remove_exome.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.noexome.list") - - add(remove_exome) - - var subset : SelectVariants = new SelectVariants - subset.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") - subset.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - subset.variantVCF = eomi_merged_hg19 - subset.out = new File(VCF_DIR+"EOMI_merged.hg19.private.vcf") - subset.intervals :+= remove_exome.outputFile - - add(subset) - - var varDepths : GetVariantDepths = new GetVariantDepths - varDepths.inVCF = subset.out - varDepths.outFile = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/results/EOMI.private.variant.depths.txt") - - add(varDepths) - - var lowpass_single : Vcf2List = new Vcf2List - lowpass_single.in_vcf = g1k_lowpass_hg19_merged - lowpass_single.filter = SINGLE_SAMPLE_EXPRESSION - lowpass_single.out_list = new File(LIST_DIR+"g1k_lowpass_single_sample.variants.list") - - add(lowpass_single) - - var exome_single : Vcf2List = new Vcf2List - exome_single.in_vcf = g1k_exome_calls - exome_single.filter = SINGLE_SAMPLE_EXPRESSION - exome_single.out_list = new File(LIST_DIR+"g1k_exome_single_sample.variants.list") - - add(exome_single) - - var exome_extract : ExtractSites = new ExtractSites(g1k_exome_calls,new File(VCF_DIR+"g1k_exome.sites.vcf")) - add(exome_extract) - - var getVennExS : CombineVariants = new CombineVariants - getVennExS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged); - getVennExS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF); - getVennExS.priority = "exome,lowpass" - getVennExS.intervals :+= exome_single.out_list - getVennExS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - getVennExS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") - getVennExS.genotypeMergeOptions = 
VariantContextUtils.GenotypeMergeType.UNIQUIFY - getVennExS.variantMergeOptions = VariantContextUtils.VariantMergeType.UNION - getVennExS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.exome.sites.vcf") - - //add(getVennExS) - - var getVennLPS : CombineVariants = new CombineVariants - getVennLPS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged); - getVennLPS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF); - getVennLPS.priority = "exome,lowpass" - getVennLPS.intervals :+= lowpass_single.out_list - getVennLPS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - getVennLPS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") - getVennLPS.genotypeMergeOptions = VariantContextUtils.GenotypeMergeType.UNIQUIFY - getVennLPS.variantMergeOptions = VariantContextUtils.VariantMergeType.UNION - getVennLPS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.lowpass.sites.vcf") - - add(getVennLPS) - - var getG1KOverlap : GetOverlapSamples = new GetOverlapSamples - getG1KOverlap.vcf1 = g1k_exome_calls - getG1KOverlap.vcf2 = g1k_lowpass_chr(1) // EUR only - getG1KOverlap.samples = new File(LIST_DIR+"g1k_EUR_exome_lowpass_overlap.txt") - - add(getG1KOverlap) - - //callOverlaps(getG1KOverlap.samples,exome_single.out_list, g1k_exome_calls) - - } - - /* - * defined modules go here - */ - - class Vcf2List extends CommandLineFunction { - @Input(doc="The vcf file to convert to list",required=true) - var in_vcf: File = _ - @Argument(doc="An egrep-based filter to apply during conversion",required=false) - var filter: String = "PASS" - @Output(doc="The list file to write to", required=true) - var out_list: File = _ - - def commandLine = { - "egrep %s %s | awk '{print $1\":\"$2}' > %s".format(filter,in_vcf.getAbsolutePath,out_list.getAbsolutePath) - } - } - - class GetOverlapSamples extends CommandLineFunction { - @Input(doc="vcf1") - var vcf1: File = _ - @Input(doc="vcf2") - 
var vcf2: File = _ - @Output(doc="sample file") - var samples: File = _ - - def commandLine = { - "head -n 500 %s %s | grep \\\\#CHR | cut -f10- | tr '\\t' '\\n' | sort | uniq -c | awk '{if ($1 == 2) print $2}' > %s".format( - vcf1.getAbsolutePath, - vcf2.getAbsolutePath, - samples.getAbsolutePath - ) - } - } - - class RemoveIntersect extends CommandLineFunction { - @Input(doc="The vcf file whose unique entries should be retained",required=true) - var fileToUnique: File = _ - @Input(doc="The vcf file whose entries overlapping the unique file you would like to remove",required=true) - var comparisonFile: File = _ - @Output(doc="The file containing only the fileToUnique-unique entries", required=true) - var outputFile: File = _ - - def commandLine = { - var tmpFile: File = java.io.File.createTempFile("removeintersect","tmp") - - "cat %s %s | sort | uniq -c | awk '{if ($1==2) print $2}' > %s ; cat %s %s | sort | uniq -c | awk '{if ($1 == 1) print $2}' > %s".format( - fileToUnique.getAbsolutePath,comparisonFile.getAbsolutePath, tmpFile.getAbsolutePath, - tmpFile.getAbsolutePath, fileToUnique.getAbsolutePath, outputFile.getAbsolutePath - ) - } - } - - class ExtractSites(inVCF: File, oVCF: File) extends CommandLineFunction { - @Input(doc="The vcf file to generate a sites only file from",required=true) - var inputVCF: File = inVCF - @Output(doc="The sites-only vcf to write to") - var outputVCF: File = oVCF - - def commandLine = { - "cut -f1-9 %s > %s".format(inputVCF.getAbsolutePath,outputVCF.getAbsolutePath) - } - } - - class ConcatVCF(in: List[File], out: File) extends CommandLineFunction { - @Input(doc="The files to concatenate",required=true) - var inputVCFs : List[File] = in - @Output(doc="The file to write to", required=true) - var outputVCF: File = out - - def commandLine = { - var header : File = java.io.File.createTempFile("concatVCF","header.tmp") - var body_unsorted : File = java.io.File.createTempFile("concatVCF","body.unsorted.tmp") - var body : File = 
java.io.File.createTempFile("concatVCF","body.tmp") - - "grep \\\\# %s > %s ; grep -v \\\\# %s | sed 's/.vcf:/\\t/g' | cut -f2- > %s ; perl %s -tmp . %s %s > %s ; cat %s %s > %s".format( - inputVCFs(0).getAbsolutePath, - header.getAbsolutePath, - inputVCFs.foldLeft[String]("")((x1: String, x2: File ) => x1 + " " + x2.getAbsolutePath ), - body_unsorted.getAbsolutePath, - "/humgen/gsa-scr1/chartl/sting/perl/sortByRef.pl", - body_unsorted.getAbsolutePath, - "/humgen/1kg/reference/human_g1k_v37.fasta.fai", - body.getAbsolutePath, - header.getAbsolutePath, - body.getAbsolutePath, - outputVCF.getAbsolutePath - ) - } - } - - class ExtractSample(vcfIn: File, sample: String, vcfOut: File) extends CommandLineFunction { - @Input(doc="File from which to extract sample") - var inVCF: File = vcfIn - @Argument(doc="The sample to extract") - var inSample: String = sample - @Output(doc="The VCF file to write to") - var outVCF: File = vcfOut - - def commandLine = { - "head -n 500 %s | grep \\\\#CHR | tr '\\t' '\\n' | grep -n %s | tr ':' '\\t' | cut -f1 | xargs -i cut -f1-9,\\{\\} %s | egrep \"\\\\#|PASS.*0/1|PASS.*1/0|PASS.*1/1\" > %s".format( - inVCF.getAbsolutePath, inSample, inVCF.getAbsolutePath, outVCF.getAbsolutePath - ) - } - } - - class GetVariantDepths extends CommandLineFunction { - @Input(doc="The VCF to get the per-sample depth at variant calls") - var inVCF : File = _ - @Output(doc="The depth file to write to") - var outFile : File = _ - - def commandLine = { - "grep PASS %s | cut -f10- | tr '\\t' '\\n' | egrep \"1/0|0/1|1/1|1\\|0|0\\|1|1\\|1\" | tr ':' '\\t' | awk '{print $3}' > %s".format( - inVCF.getAbsolutePath, outFile.getAbsolutePath - ) - } - } - - def combineAndMergeSites(chrInputs: List[File], output: File) = { - var sites_only: List[ExtractSites] = chrInputs.map( f => new ExtractSites(f,new File(VCF_DIR+swapExt(f,".vcf",".sites.vcf").getName))) - for ( s <- sites_only ) { - add(s) - } - var trivialMerge : ConcatVCF = new ConcatVCF(sites_only.map( s => 
s.outputVCF ), output) - add(trivialMerge) - } - - def liftoverVCF(b36vcf: File, b37vcf: File) = { - - class Liftover(in: File, out: File) extends CommandLineFunction { - @Input(doc="foo") - val input: File = in - @Output(doc="foo") - val output : File = out - - val sting : String = "/humgen/gsa-scr1/chartl/sting/" - val chain : String = "/humgen/gsa-hpprojects/GATK/data/Liftover_Chain_Files/b36ToHg19.broad.over.chain" - - def commandLine = { - "%s/perl/liftOverVCF.pl -vcf %s -chain %s -out %s -gatk %s -newRef %s -oldRef %s -tmp .".format( - sting, input, chain, output, sting, "/seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19", - "/humgen/1kg/reference/human_b36_both" - ) - } - - class RMDupes(in: File, out:File) extends CommandLineFunction { - @Input(doc="foo") - var input : File = in - @Output(doc="foo") - var output : File = out - - def commandLine = "cat %s | python /humgen/gsa-scr1/chartl/projects/omni/scripts/filterDupes.py > %s".format( - input.getAbsolutePath,output.getAbsolutePath - ) - } - - var lift : Liftover = new Liftover(b36vcf,swapExt(b37vcf,".vcf",".undeduped.vcf")) - - add(lift) - - var rmDupes : RMDupes = new RMDupes(lift.output,b37vcf) - - add(rmDupes) - } - } - - def callOverlaps(samples: File, intervals: File, wexCalls: File) = { - class GetBamList(sample: String, oList: File) extends CommandLineFunction { - @Argument(doc="foo") - var inSample = sample - @Output(doc="foo") - var outList = oList - @Input(doc = "foo") - var waitForMe: File = _ - - def commandLine = { - "grep %s /humgen/1kg/processing/allPopulations_wholeGenome_august_release/bamLists/*.list | tr ':' '\\t' | awk '{print $2}' | sort | uniq > %s".format( - inSample, - outList.getAbsolutePath - ) - } - } - for ( s <- scala.io.Source.fromFile(samples).getLines ) { - var extract : ExtractSample = new ExtractSample(wexCalls,s,new File(VCF_DIR+"g1k_exome.subset.%s.vcf".format(s))) - add(extract) - - var sites : Vcf2List = new Vcf2List - sites.in_vcf = extract.outVCF - 
sites.out_list = new File(SCRATCH_DIR+"g1k_exome.subset.%s.variants.list".format(s)) - add(sites) - - var bamList : GetBamList = new GetBamList(s,new File(SCRATCH_DIR+"%s.bams.list".format(s))) - bamList.waitForMe = sites.out_list - - add(bamList) - - var genotype : UnifiedGenotyper = new UnifiedGenotyper - genotype.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") - genotype.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - genotype.intervals :+= sites.out_list - genotype.out = new File(VCF_DIR+"g1k_lowpass.%s.exome_sites.vcf".format(s)) - genotype.input_file :+= bamList.outList - genotype.memoryLimit = 3 - genotype.output_all_callable_bases = true - - add(genotype) - - } - } - -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/chartl/omni_qc.q b/scala/qscript/oneoffs/chartl/omni_qc.q deleted file mode 100755 index 4f4839f83..000000000 --- a/scala/qscript/oneoffs/chartl/omni_qc.q +++ /dev/null @@ -1,505 +0,0 @@ -import java.io.{FileReader, File, BufferedReader} -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils -import scala.collection.mutable.HashMap - -class omni_qc extends QScript { - qscript => - - // NON-OMNI VCF FILES - var pilot3_release_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/pilot3/merge_release/ALL.exon.2010_03.genotypes.vcf","vcf") - var pilot1_ceu_vcf = new 
TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf") - var pilot1_chb_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CHBJPT.low_coverage.2010_07.genotypes.vcf.gz","vcf") - var pilot1_yri_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/YRI.low_coverage.2010_07.genotypes.vcf.gz","vcf") - var august_calls_EUR = new TaggedFile("/humgen/1kg/processing/release/august/EUR.vcf","vcf") - var august_calls_ASN = new TaggedFile("/humgen/1kg/processing/release/august/ASN.vcf","vcf") - var august_calls_AFR = new TaggedFile("/humgen/1kg/processing/release/august/AFR.vcf","vcf") - var august_calls_EUR_refined = new TaggedFile("/humgen/1kg/processing/release/august/bgzip_for_release/EUR.beagle.vcf.gz","vcf") - var august_calls_ASN_refined = new TaggedFile("/humgen/1kg/processing/release/august/bgzip_for_release/ASN.beagle.vcf.gz","vcf") - var august_calls_AFR_refined = new TaggedFile("/humgen/1kg/processing/release/august/bgzip_for_release/AFR.beagle.vcf.gz","vcf") - var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf") - var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf") - var pilot1_na12878_beagle = new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/beagle/lowpass.N60.recal.CEUTSI.bgl.output.vcf") - //var august_calls_other_genotypes = _ - - // OMNI VCF FILES - var OMNI_b36_vcf = new TaggedFile("/humgen/illumina/1kg_seq_vcfs/Illumina_HapMap_Omni_2.5_764samples.vcf","vcf") - var OMNI_b37_vcf = new TaggedFile("/broad/shptmp/chartl/Omni_2.5_764_samples.b37.deduped.vcf","vcf") - var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot.b36.vcf","vcf") - var OMNI_b36_panel_vcf = new 
TaggedFile("/broad/shptmp/chartl/omni/vcfs/Omni_b36_with_panel_sets.vcf","vcf") - var OMNI_b37_birdseed = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/OMNI_birdseed_only.vcf") - var OMNI_b37_joint = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/OMNI_joint_birdseed_lowpass.vcf") - - // INTERVALS - var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list" - var pilot1_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b36_sites.interval.list" - var hiseq_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b36_sites.interval.list" - var production_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b37_sites.chr20.interval.list" - - // REFERENCES - var b36_ref = new File("/humgen/1kg/reference/human_b36_both.fasta") - var b37_ref = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - // OTHER - val analysis_dir = "/broad/shptmp/chartl/omni/" - val resources_dir = analysis_dir + "resources/" - val scratch_dir = analysis_dir + "scratch/" - val eval_dir = analysis_dir + "eval/" - val vcf_dir = analysis_dir + "vcfs/" - val p1_ceu_only = scratch_dir+"Pilot1_CEU_only_sites.intervals.list" - val p1_chbjpt_only = scratch_dir+"Pilot1_CHBJPT_only_sites.intervals.list" - val p1_yri_only = scratch_dir+"Pilot1_YRI_only_sites.intervals.list" - - // OTHER CHIPS - - val OMNI_QUAD_1KG = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/other_chips/1KG_OMNI.ref_fixed.vcf") - val AFFY_6_0 = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/other_chips/1KG_ARRAY.ref_fixed.vcf") - - trait OmniArgs extends CommandLineGATK { - this.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") - } - - class vcf2bed extends CommandLineFunction { - @Input(doc="A VCF file to be put into an interval list") var in_vcf: File = _ - @Output(doc="An interval list file to be used 
with -L") var out_list: File = _ - - def commandLine = "python /humgen/gsa-scr1/chartl/projects/omni/scripts/vcf2bed.py %s %s".format(in_vcf.getAbsolutePath,out_list.getAbsolutePath) - } - - class GetSampleOverlap extends CommandLineFunction { - @Input(doc="A list of VCF files for which to calculate the sample overlap") var in_vcfs: List[File] = Nil - @Output(doc="A file to which to write the overlapping sample names") var outFile: File = _ - - /*def commandLine = "grep #CHR %s | sed 's/.vcf:/\\t/g' | cut -f11- | tr '\\t' '\\n' | sort | uniq -c | awk '$1 == %d' | awk '{print $2}' > %s".format( - in_vcfs.foldLeft[String]("")( (str,f) => if ( str.equals("") ) str + f.getAbsolutePath else str + " " + f.getAbsolutePath), - in_vcfs.size, - outFile.getAbsolutePath - )*/ - def commandLine = "python /humgen/gsa-scr1/chartl/projects/omni/scripts/getOverlapSamples.py %s %s".format( - in_vcfs.foldLeft[String]("")( (str,f) => if ( str.equals("") ) str + f.getAbsolutePath else str + " " + f.getAbsolutePath), - outFile.getAbsolutePath - ) - } - - class GunzipFile extends CommandLineFunction { - @Input(doc="file to gunzip") var gunzipMe: File = _ - @Output(doc="file to gunzip to") var outFile: File = _ - - def commandLine = "gunzip -c %s > %s".format(gunzipMe.getAbsolutePath,outFile.getAbsolutePath) - } - - def script = { - - /** Convert other chips to merged VCFs **/ - - //var august_call_other_chips: List[(String,File)] = processAuxiliaryChipData(august_calls_other_genotypes) - - - /** Unzip the pilot 1 VCFs and dump them into the resources directory **/ - - var gunzip_p1_ceu = new GunzipFile - var gunzip_p1_chb = new GunzipFile - var gunzip_p1_yri = new GunzipFile - var gunzip_hiseq = new GunzipFile - var gunzip_ag_eur = new GunzipFile - var gunzip_ag_asn = new GunzipFile - var gunzip_ag_afr = new GunzipFile - - gunzip_p1_ceu.gunzipMe = pilot1_ceu_vcf - gunzip_p1_ceu.outFile = new File(resources_dir+"CEU.low_coverage.genotypes.vcf") - gunzip_p1_chb.gunzipMe = pilot1_chb_vcf - 
gunzip_p1_chb.outFile = new File(resources_dir+"CHB.low_coverage.genotypes.vcf") - gunzip_p1_yri.gunzipMe = pilot1_yri_vcf - gunzip_p1_yri.outFile = new File(resources_dir+"YRI.low_coverage.genotypes.vcf") - gunzip_hiseq.gunzipMe = hiseq_calls_vcf - gunzip_hiseq.outFile = new File(resources_dir+"HiSeq.b36.vcf") - gunzip_ag_eur.gunzipMe = august_calls_EUR_refined - gunzip_ag_eur.outFile = new File(resources_dir+"EUR.refined.vcf") - gunzip_ag_asn.gunzipMe = august_calls_ASN_refined - gunzip_ag_asn.outFile = new File(resources_dir+"ASN.refined.vcf") - gunzip_ag_afr.gunzipMe = august_calls_AFR_refined - gunzip_ag_afr.outFile = new File(resources_dir+"AFR.refined.vcf") - - add(gunzip_p1_ceu,gunzip_p1_yri,gunzip_p1_chb,gunzip_hiseq,gunzip_ag_eur,gunzip_ag_asn,gunzip_ag_afr) - - /** fix the omni ref bases **/ - var fix_421 = new FixRefBases with OmniArgs - var fix_764 = new FixRefBases with OmniArgs - var fix_764_b37 = new FixRefBases with OmniArgs - - fix_421.variantVCF = OMNI_hapmap_b36_vcf - fix_421.reference_sequence = b36_ref - fix_421.out = new File(vcf_dir+swapExt(OMNI_hapmap_b36_vcf.getName,".vcf",".ref_fixed.vcf")) - fix_421.bypassException = true - fix_764.variantVCF = OMNI_b36_vcf - fix_764.reference_sequence = b36_ref - fix_764.out = new File(vcf_dir+swapExt(OMNI_b36_vcf.getName,".vcf",".ref_fixed.vcf")) - fix_764.bypassException = true - fix_764_b37.variantVCF = OMNI_b37_vcf - fix_764_b37.reference_sequence = b37_ref - fix_764_b37.out = new File(vcf_dir+swapExt(OMNI_b37_vcf.getName,".vcf",".ref_fixed.vcf")) - fix_764_b37.bypassException = true - - add(fix_421,fix_764,fix_764_b37) - - /** Propagate AC/AN annotations to Omni files via variant annotator **/ - var annotate_421 = new VariantAnnotator with OmniArgs - var annotate_764 = new VariantAnnotator with OmniArgs - var annotate_764_b37 = new VariantAnnotator with OmniArgs - - annotate_421.variantVCF = OMNI_hapmap_b36_vcf - annotate_421.reference_sequence = b36_ref - annotate_421.annotation :+= 
"ChromosomeCounts" - annotate_421.out = new File(vcf_dir+swapExt(annotate_421.variantVCF.getName,".vcf",".annot.vcf")) - annotate_764.variantVCF = OMNI_b36_vcf - annotate_764.reference_sequence = b36_ref - annotate_764.annotation :+= "ChromosomeCounts" - annotate_764.out = new File(vcf_dir+swapExt(annotate_764.variantVCF.getName,".vcf",".annot.vcf")) - annotate_764_b37.variantVCF = OMNI_b37_vcf - annotate_764_b37.reference_sequence = b37_ref - annotate_764_b37.annotation :+= "ChromosomeCounts" - annotate_764_b37.out = new File(vcf_dir+swapExt(annotate_764_b37.variantVCF.getName,".vcf",".annot.vcf")) - - add(annotate_421,annotate_764,annotate_764_b37) - - /** Eval the omni chip against the various comps **/ - runEval(annotate_764.out,gunzip_p1_ceu.outFile,"OMNI_764","Pilot1_CEU",pilot1_interval_list, b36_ref) - runEval(annotate_421.out,gunzip_p1_ceu.outFile,"OMNI_421","Pilot1_CEU",pilot1_interval_list, b36_ref,true) - //runEval(OMNI_hapmap_b36_vcf,gunzip_p1_ceu.outFile,"OMNI_421_Unfixed","Pilot1_CEU",pilot1_interval_list,b36_ref) - runEval(annotate_764.out,gunzip_p1_chb.outFile,"OMNI_764","Pilot1_CHB",pilot1_interval_list, b36_ref) - runEval(annotate_421.out,gunzip_p1_chb.outFile,"OMNI_421","Pilot1_CHB",pilot1_interval_list, b36_ref) - runEval(annotate_764.out,gunzip_p1_yri.outFile,"OMNI_764","Pilot1_YRI",pilot1_interval_list, b36_ref) - runEval(annotate_421.out,gunzip_p1_yri.outFile,"OMNI_421","Pilot1_YRI",pilot1_interval_list, b36_ref) - runEval(annotate_764.out,pilot3_release_vcf,"OMNI_764","Pilot3",pilot3_interval_list, b36_ref) - runEval(annotate_421.out,pilot3_release_vcf,"OMNI_421","Pilot3",pilot3_interval_list, b36_ref) - runEval(annotate_764_b37.out,gunzip_ag_eur.outFile,"OMNI_764","August_EUR",production_interval_list, b37_ref) - runEval(annotate_764_b37.out,gunzip_ag_asn.outFile,"OMNI_764","August_ASN",production_interval_list, b37_ref) - runEval(annotate_764_b37.out,gunzip_ag_afr.outFile,"OMNI_764","Ausust_AFR",production_interval_list, b37_ref) - 
runEval(annotate_764.out,gunzip_hiseq.outFile,"OMNI_764","HiSeq",hiseq_interval_list, b36_ref) - runEval(annotate_764.out,annotate_421.out,"OMNI_764","OMNI_421_FIXED",pilot1_interval_list,b36_ref) - runEval(annotate_764.out,OMNI_QUAD_1KG,"OMNI_764","OMNI_QUAD",pilot1_interval_list,b36_ref) - runEval(annotate_764.out,AFFY_6_0,"OMNI_764","AFFY_6_0",pilot1_interval_list,b36_ref) - runEval(OMNI_b37_birdseed,gunzip_ag_eur.outFile,"OMNI_birdseed","August_EUR",production_interval_list,b37_ref) - runEval(OMNI_b37_joint,gunzip_ag_eur.outFile,"OMNI_joint","August_EUR",production_interval_list,b37_ref) - runEval(OMNI_QUAD_1KG,gunzip_p1_ceu.outFile,"OMNI_QUAD_1KG","Pilot1_CEU",pilot1_interval_list,b36_ref) - runEval(AFFY_6_0,gunzip_p1_ceu.outFile,"AFFY_6_0","Pilot1_CEU",pilot1_interval_list,b36_ref) - - var eval1KG_exclude = new VariantEval with OmniArgs - eval1KG_exclude.samples :+= "/broad/shptmp/chartl/omni/scratch/OMNI_764_vs_Pilot3.sample_overlap.exclude.mixups.txt" - eval1KG_exclude.rodBind :+= new RodBind("evalOMNI_764","VCF",annotate_764.out) - eval1KG_exclude.rodBind :+= new RodBind("compPilot3","VCF",pilot3_release_vcf) - eval1KG_exclude.evalModule :+= "GenotypeConcordance" - eval1KG_exclude.evalModule :+= "SimpleMetricsBySample" - eval1KG_exclude.reference_sequence = b36_ref - eval1KG_exclude.reportType = VE2TemplateType.CSV - eval1KG_exclude.intervalsString :+= pilot3_interval_list - eval1KG_exclude.out = new File(eval_dir+"%s_vs_%s.%s".format("OMNI_764","Pilot3","exclude.mixups.eval.csv")) - - add(eval1KG_exclude) - - runAFComparison(annotate_764.out, gunzip_p1_ceu.outFile, gunzip_p1_chb.outFile, gunzip_p1_yri.outFile) - - var subset421: SelectVariants = new SelectVariants with OmniArgs - subset421.reference_sequence = b36_ref - subset421.sample :+= (new File(scratch_dir+"OMNI_421_vs_Pilot1_CEU.sample_overlap.txt")).getAbsolutePath - subset421.variantVCF = annotate_764.out - subset421.out = new 
File(vcf_dir+swapExt(annotate_764.out.getName,".vcf",".subset.pilot1CEU.vcf")) - - add(subset421)// lastly to find things in the three-way pilot venn - - var combine: CombineVariants = new CombineVariants with OmniArgs - combine.reference_sequence = b36_ref - combine.rodBind :+= new RodBind("CEU","VCF",gunzip_p1_ceu.outFile) - combine.rodBind :+= new RodBind("ASN","VCF",gunzip_p1_chb.outFile) - combine.rodBind :+= new RodBind("YRI","VCF",gunzip_p1_yri.outFile) - combine.genotypeMergeOptions = VariantContextUtils.GenotypeMergeType.UNIQUIFY - combine.priority = "%s,%s,%s".format("CEU","ASN","YRI") - combine.out = new File(vcf_dir+"Pilot1_Populations_Combined.vcf") - - add(combine) - - selectSites(OMNI_b36_panel_vcf,p1_ceu_only,"ceu_only_sites") - selectSites(OMNI_b36_panel_vcf,p1_chbjpt_only,"chbjpt_only_sites") - selectSites(OMNI_b36_panel_vcf,p1_yri_only,"yri_only_sites") - - runBeagleAnalysis(new File(vcf_dir + "Illumina_HapMap_Omni_2.5_764samples.annot.stripped.vcf")) - - } - - def processAuxiliaryChipData(otherChips: File) : List[(String,File)] = { - // todo ==== me - return Nil - } - - def runEval(eval: File, comp: File, eBase: String, cBase: String, intervals: String, reference: File, interesting: Boolean = false) = { - var base = "%s_vs_%s".format(eBase,cBase) - var getOverlap = new GetSampleOverlap - getOverlap.in_vcfs :+= eval - getOverlap.in_vcfs :+= comp - getOverlap.outFile = new File(scratch_dir+base+".sample_overlap.txt") - add(getOverlap) - - var vEval = new VariantEval with OmniArgs - vEval.samples :+= getOverlap.outFile.getAbsolutePath - vEval.rodBind :+= new RodBind("eval"+eBase,"VCF",eval) - vEval.rodBind :+= new RodBind("comp"+cBase,"VCF",comp) - vEval.evalModule :+= "GenotypeConcordance" - vEval.evalModule :+= "SimpleMetricsBySample" - vEval.intervalsString :+= intervals - vEval.reference_sequence = reference - vEval.reportType = VE2TemplateType.CSV - - vEval.out = new File(eval_dir+base+".eval.csv") - - if ( interesting ) { - 
vEval.discordantInteresting = true - vEval.outputVCF = new File(vcf_dir+"%s_vs_%s.interesting_sites.vcf".format(eBase,cBase)) - } - - add(vEval) - - } - - def swapExt(s: String, d: String, f: String) : String = { - return s.stripSuffix(d)+f - } - - def runAFComparison(omni: File, p1ceu: File, p1asn: File, p1yri:File ) : Boolean = { - // step one, set up some of the info - var populations : List[String] = Nil // these are the pilot 1 populations - populations :+= "CEU" - populations :+= "CHBJPT" - populations :+= "YRI" - var panels : List[String] = Nil // these are the analysis panels - panels :+= "EUR" - panels :+= "ASN" - panels :+= "ASW" - panels :+= "AFR" - panels :+= "ADM" - // step two -- subset the OMNI chip to the actual sample names - var nameToSubset: HashMap[String,SelectVariants] = new HashMap[String,SelectVariants] - for ( p <- populations ) { - nameToSubset += p -> sampleSubset(p,omni) - } - - for ( p <- panels ) { - nameToSubset += p -> sampleSubset(p,omni) - } - - // step three -- compare the pilot 1 files against all populations and panels - - runComps("Pilot1CEU",p1ceu,"CEU",nameToSubset("CEU").out) - runComps("Pilot1CEU",p1ceu,"EUR",nameToSubset("EUR").out) - runComps("Pilot1CHBJPT",p1asn,"CHBJPT",nameToSubset("CHBJPT").out) - runComps("Pilot1CHBJPT",p1asn,"ASN",nameToSubset("ASN").out) - runComps("Pilot1YRI",p1yri,"YRI",nameToSubset("YRI").out) - runComps("Pilot1YRI",p1yri,"AFR",nameToSubset("AFR").out) - runComps("EUR",nameToSubset("EUR").out,"AFR",nameToSubset("AFR").out) - runComps("EUR",nameToSubset("EUR").out,"ASN",nameToSubset("ASN").out) - runComps("EUR",nameToSubset("EUR").out,"ASW",nameToSubset("ASW").out) - runComps("EUR",nameToSubset("EUR").out,"AMR",nameToSubset("ADM").out) - - - var panelCombine: CombineVariants = new CombineVariants with OmniArgs - panelCombine.reference_sequence = b36_ref - panelCombine.priority = "" - for ( p <- panels ) { - panelCombine.rodBind :+= new RodBind(p,"VCF",nameToSubset(p).out) - panelCombine.priority 
= if ( panelCombine.priority.equals("") ) p else panelCombine.priority + "," + p - } - panelCombine.out = OMNI_b36_panel_vcf - panelCombine.genotypeMergeOptions = VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE - panelCombine.variantMergeOptions = VariantContextUtils.VariantMergeType.UNION - panelCombine.setKey = "panel" - - add(panelCombine) - return true - - } - - def getOmniSampleListByPanel(panel: String) : String = { - return scratch_dir+"OMNI_764_%s.txt".format(panel) - } - - def sampleSubset(panel: String, omni: File) : SelectVariants = { - var sv : SelectVariants = new SelectVariants with OmniArgs - sv.reference_sequence = b36_ref - sv.variantVCF = omni - sv.sample :+= getOmniSampleListByPanel(panel) - sv.out = new File(vcf_dir+swapExt(omni.getName,".vcf",".%s.vcf".format(panel))) - add(sv) - return sv - } - - def runComps(eBase: String, evalVCF: File, cBase: String, compVCF: File) = { - var eval: VariantEval = new VariantEval with OmniArgs - eval.reference_sequence = b36_ref - eval.rodBind :+= new RodBind("eval%s".format(eBase),"VCF",evalVCF) - eval.rodBind :+= new RodBind("comp%s".format(cBase),"VCF",compVCF) - eval.noStandard = true - eval.E :+= "AlleleFrequencyComparison" - eval.reportType = VE2TemplateType.CSV - eval.out = new File(eval_dir+"%s_vs_%s_allele_frequency.eval".format(eBase,cBase)) - - add(eval) - - var combine: CombineVariants = new CombineVariants with OmniArgs - combine.reference_sequence = b36_ref - combine.rodBind :+= new RodBind(eBase,"VCF",evalVCF) - combine.rodBind :+= new RodBind(cBase,"VCF",compVCF) - combine.out = new File(vcf_dir+"%s_plus_%s.vcf".format(eBase,cBase)) - combine.genotypeMergeOptions = VariantContextUtils.GenotypeMergeType.UNIQUIFY - combine.priority = "%s,%s".format(eBase,cBase) - - //add(combine) - - } - - def selectSites(vcf: File, intervals: String, base: String) { - var sv = new SelectVariants with OmniArgs - sv.reference_sequence = b36_ref - sv.variantVCF = vcf - sv.out = swapExt(vcf,".vcf",base+".vcf") 
- sv.intervalsString :+= intervals - - add(sv) - } - - def runBeagleAnalysis(omnivcf: File) { - var combine : CombineVariants = new CombineVariants with OmniArgs - combine.reference_sequence = b36_ref - for ( c <- 1 until 23) { - combine.rodBind :+= new RodBind("beagle%s".format(c),"VCF",runBeagle(omnivcf,"%s".format(c))) - if ( c > 1 ) { - combine.priority = combine.priority+",%s%s".format("beagle",c) - } else { - combine.priority = "%s%s".format("beagle",c) - } - } - combine.genotypeMergeOptions = VariantContextUtils.GenotypeMergeType.PRIORITIZE - combine.variantMergeOptions = VariantContextUtils.VariantMergeType.UNION - - combine.out = swapExt(pilot1_with_na12878_vcf,".vcf",".beagle_refined_with_omni.vcf") - - add(combine) - - var select : SelectVariants = new SelectVariants with OmniArgs - select.reference_sequence = b36_ref - select.variantVCF = combine.out - select.sample :+= "NA12878" - select.out = new File(vcf_dir + "NA12878.lowpass.beagle.refined.with.pilot1.vcf") - - add(select) - - var eval : VariantEval = new VariantEval with OmniArgs - eval.reference_sequence = b36_ref - eval.rodBind :+= new RodBind("evalNA12878LowPass","VCF",select.out) - eval.rodBind :+= new RodBind("compNA12878HiSeq","VCF",hiseq_calls_vcf) - eval.E :+= "GenotypeConcordance" - eval.out = new File(eval_dir+"NA12878.lowpass.beagle.vs.HiSeq.eval") - eval.excludeIntervals :+= new File(pilot1_interval_list) - eval.reportType = VE2TemplateType.CSV - - add(eval) - - var eval2: VariantEval = new VariantEval with OmniArgs - eval2.reference_sequence = b36_ref - eval2.rodBind :+= new RodBind("evalNA12878Beagle","VCF",pilot1_na12878_beagle) - eval2.rodBind :+= new RodBind("compNA12878HiSeq","VCF",hiseq_calls_vcf) - eval2.E :+= "GenotypeConcordance" - eval2.sample :+= "NA12878" - eval2.out = new File(eval_dir+"NA12878.lowpass.nochip.vs.Hiseq.eval") - eval2.excludeIntervals :+= new File(pilot1_interval_list) - eval2.reportType = VE2TemplateType.CSV - - add(eval2) - - var eval3: VariantEval = new 
VariantEval with OmniArgs - eval3.reference_sequence = b36_ref - eval3.rodBind :+= new RodBind("evalNA12878NoBeagle","VCF",pilot1_with_na12878_vcf) - eval3.rodBind :+= new RodBind("compNA12878HiSeq","VCF",hiseq_calls_vcf) - eval3.E :+= "GenotypeConcordance" - eval3.sample :+= "NA12878" - eval3.out = new File(eval_dir+"NA12878.lowpass.nochip.norefined.vs.Hiseq.eval") - eval3.excludeIntervals :+= new File(pilot1_interval_list) - eval3.reportType = VE2TemplateType.CSV - - add(eval3) - } - - def runBeagle(omnivcf: File, chr: String): File = { - var beagleInput = new ProduceBeagleInput with OmniArgs - beagleInput.reference_sequence = b36_ref - beagleInput.intervalsString :+= chr - beagleInput.variantVCF = pilot1_with_na12878_vcf - beagleInput.rodBind :+= new RodBind("validation","VCF",omnivcf) - beagleInput.validation_genotype_ptrue = 0.99 - beagleInput.out = new File(scratch_dir+"/"+swapExt(beagleInput.variantVCF.getName,".vcf",".%s.beagle".format(chr))) - println (beagleInput.out.getAbsolutePath) - - var runBeagle : BeagleRefinement = new BeagleRefinement - runBeagle.beagleInput = beagleInput.out - runBeagle.beagleOutputBase = "Pilot1_NA12878_Beagle_with_OMNI_chr%s".format(chr) - runBeagle.beagleMemoryGigs = 6 - runBeagle.memoryLimit = 6 - runBeagle.beagleOutputDir = "" - runBeagle.freezeOutputs - - var gunzipPhased = new GunzipFile - gunzipPhased.gunzipMe = runBeagle.beaglePhasedFile - gunzipPhased.outFile = new File(scratch_dir+swapExt(runBeagle.beaglePhasedFile.getName,".gz","")) - - var gunzipLike = new GunzipFile - gunzipLike.gunzipMe = runBeagle.beagleLikelihoods - gunzipLike.outFile = new File(scratch_dir+swapExt(runBeagle.beagleLikelihoods.getName,".gz","")) - - - var convertBack : BeagleOutputToVCF = new BeagleOutputToVCF with OmniArgs - convertBack.reference_sequence = b36_ref - convertBack.variantVCF = pilot1_with_na12878_vcf - convertBack.intervalsString :+= chr - convertBack.rodBind :+= new RodBind("beagleR2","beagle",runBeagle.beagleRSquared) - 
convertBack.rodBind :+= new RodBind("beagleProbs","beagle",gunzipLike.outFile) - convertBack.rodBind :+= new RodBind("beaglePhased","beagle",gunzipPhased.outFile) - convertBack.out = new File(vcf_dir+swapExt(pilot1_with_na12878_vcf.getName,".vcf",".chr%s.beagle_refined_plus_omni.vcf".format(chr))) - - add(beagleInput,runBeagle,gunzipPhased,gunzipLike,convertBack) - - return convertBack.out - } - - class BeagleRefinement extends CommandLineFunction { - @Input(doc="The beagle input file") var beagleInput: File = _ - var beagleOutputBase: String = _ - var beagleMemoryGigs: Int = 4 - - /** - * Note: These get set - */ - @Output(doc="The beagle phased file") var beaglePhasedFile: File = _ - @Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _ - @Output(doc="The beagle r2 file") var beagleRSquared: File = _ - var beagleOutputDir: String = _ - - def freezeOutputs = { - if ( beagleOutputDir == null && beagleInput.getParent == null ) { - beagleOutputDir = "" - } else if ( beagleOutputDir == null ) { - beagleOutputDir = beagleInput.getParent+"/" - } - beaglePhasedFile = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".phased.gz") - beagleLikelihoods = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".gprobs.gz") - beagleRSquared = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".r2") - } - - def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar /humgen/gsa-hpprojects/software/beagle/beagle.jar like=%s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleInput.getAbsolutePath,beagleOutputBase) - } -} diff --git a/scala/qscript/oneoffs/chartl/private_mutations.q b/scala/qscript/oneoffs/chartl/private_mutations.q deleted file mode 100755 index cea08b006..000000000 --- a/scala/qscript/oneoffs/chartl/private_mutations.q +++ /dev/null @@ -1,89 +0,0 @@ -import collection.JavaConversions._ -import java.io.FileNotFoundException -import org.broadinstitute.sting.datasources.pipeline._ -import 
org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.library.ipf.vcf._ -import org.broadinstitute.sting.queue.pipeline._ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.utils.yaml.YamlUtils - -class private_mutations extends QScript { - @Argument(shortName="yaml",fullName="eomiYaml",doc="Project YAML file",required=true) var eomiYaml: File = _ - @Argument(shortName="sting",fullName="stingDir",doc="path to the Sting directory",required=true) var sting: String = _ - @Argument(shortName="out",fullName="finalVCF",doc="the merged vcf to write to", required=true) var finalMergedVCF : File = _ - @Argument(shortName="mask",fullName="indelAndSVMask",doc="The indel/SV mask to apply during filtration",required=true) var filtMask : File = _ - - var gatkjar : File = _ - def script = { - gatkjar = new File(sting+"/dist/GenomeAnalysisTK.jar") - var input_pipeline : Pipeline = YamlUtils.load(classOf[Pipeline],eomiYaml) - var eomi_pipeline : Pipeline = new Pipeline - // use only QC-positive samples - eomi_pipeline.setProject( input_pipeline.getProject ) - eomi_pipeline.setSamples( input_pipeline.getSamples.filter( p => p.getTags.get("QCStatus").equals("PASS")) ) - var vcLib : VariantCalling = new VariantCalling(eomi_pipeline,gatkjar) - var pmLib : ProjectManagement = new ProjectManagement(sting) - - /*eomi_pipeline.getSamples.foreach( p => - if ( ! 
p.getBamFiles.get("recalibrated").exists) throw new FileNotFoundException( - p.getBamFiles.get("recalibrated").getAbsolutePath+" does not exist" ))*/ - - var batches : List[List[PipelineSample]] = eomi_pipeline.getSamples.toList.grouped(100).toList - var genotypers : List[UnifiedGenotyper] = batches.map( pl => pl.map( p => p.getBamFiles.get("recalibrated") ) ).zipWithIndex.map( - b => vcLib.StandardUnifiedGenotyper(b._1,new File(eomi_pipeline.getProject.getName+"_batch%d.raw.vcf".format(1+b._2)))) - addAll(genotypers) - - var handFilters : List[VariantFiltration] = genotypers.map( g => vcLib.StandardHandfilter(g.out,swapExt(g.out,".raw.vcf",".handfiltered.vcf"))) - handFilters.foreach( p => { p.rodBind :+= new RodBind("mask","bed",filtMask) - p.mask = "NearIndelOrSV"} ) - - addAll(handFilters) - - addAll(pmLib.MergeBatches(handFilters.map( _.out), batches.flatten.map( p => p.getBamFiles.get("recalibrated")), - finalMergedVCF,eomi_pipeline.getProject.getReferenceFile,20)) - - var afr_sams : List[PipelineSample] = eomi_pipeline.getSamples.toList.filter( p => p.getTags.get("Population").equals("AFRAMR")) - var eur_sams : List[PipelineSample] = eomi_pipeline.getSamples.toList.filter( p => p.getTags.get("Population").equals("EURAMR") || - p.getTags.get("Population").equals("UNK")) - - var variant_loci : VCFExtractIntervals = new VCFExtractIntervals(finalMergedVCF,swapExt(finalMergedVCF,".vcf",".intervals.list"),false) - - add(variant_loci) - - var extract_afr : VCFExtractSamples = new VCFExtractSamples(finalMergedVCF,swapExt(finalMergedVCF,".vcf",".afr.vcf"),afr_sams.map(_.getId)) - var extract_eur : VCFExtractSamples = new VCFExtractSamples(finalMergedVCF,swapExt(finalMergedVCF,".vcf",".eur+unk.vcf"),eur_sams.map(_.getId)) - - add(extract_afr) - add(extract_eur) - - var eval_all : VariantEval = vcLib.addTrait(new VariantEval) - eval_all.rodBind :+= new RodBind("evalEOMI","vcf",finalMergedVCF) - eval_all.noStandard = true - eval_all.E :+= "ACTransitionTable" - 
eval_all.out = swapExt(finalMergedVCF,".vcf",".perm.csv") - //eval_all.reportType = org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV - - add(eval_all) - - var eval_afr : VariantEval = vcLib.addTrait(new VariantEval) - eval_afr.rodBind :+= new RodBind("evalAFR","VCF",extract_afr.outputVCF) - eval_afr.rodBind :+= new RodBind("compEUR","VCF",extract_eur.outputVCF) - eval_afr.E :+= "ACTransitionTable" - eval_afr.out = swapExt(extract_afr.outputVCF,".vcf",".perm.csv") - //eval_afr.reportType = org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV - eval_afr.noStandard = true - - add(eval_afr) - - var eval_eur : VariantEval = vcLib.addTrait(new VariantEval) - eval_eur.rodBind :+= new RodBind("compAFR","VCF",extract_afr.outputVCF) - eval_eur.rodBind :+= new RodBind("evalEUR","VCF",extract_eur.outputVCF) - eval_eur.E :+= "ACTransitionTable" - eval_eur.out = swapExt(extract_eur.outputVCF,".vcf",".perm.csv") - //eval_eur.reportType = org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV - eval_eur.noStandard = true - - add(eval_eur) - } - -} diff --git a/scala/qscript/oneoffs/delangel/Phase1IndelCalling.scala b/scala/qscript/oneoffs/delangel/Phase1IndelCalling.scala deleted file mode 100755 index 188aa74c4..000000000 --- a/scala/qscript/oneoffs/delangel/Phase1IndelCalling.scala +++ /dev/null @@ -1,161 +0,0 @@ -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.commandline.ArgumentSource -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction} -import org.broadinstitute.sting.queue.{QException, QScript} 
-import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils - -class Phase1Calling extends QScript { - qscript => - - @Input(doc="path to GATK jar", shortName="gatk", required=true) - var gatkJar: File = _ - - @Input(doc="the chromosome to process", shortName="chr", required=false) - var chr: Int = 20 - - @Input(doc="output path", shortName="outputDir", required=false) - var outputDir: String = "/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.calls.indels" - - @Input(doc="base output filename", shortName="baseName", required=false) - var baseName: String = "" - - @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false) - var outputTmpDir: String = "/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams" - - private val tmpDir: File = new File("/broad/shptmp/delangel/tmp/") - private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod") - private val dbSNPIndels: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - private val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf" - private val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/AFR.dindel_august_release.20110110.sites.vcf.gz" - private val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/ASN.dindel_august_release.20110110.sites.vcf.gz" - private val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/EUR.dindel_august_release.20110110.sites.vcf.gz" - private val dindelMask: String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/pilot1.dindel.mask.bed" - val hapmap = 
"/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" - val g1k = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg_pilot1_projectCalls/ALL.low_coverage.2010_07.hg19.vcf" - val omni = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/764samples.deduped.b37.annot.vcf" - val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566) - //val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","JPT","LWK","MXL","PUR","TSI","YRI") - //val populations = List("JPT","ASN","AMR") - val populations = List("EUR","AMR","ASN","AFR") - //val populations = List("FIN", "LWK") - private val intervals: String = "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals" - //val populations = List("ZZZ") // small set used for debugging - - private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference - this.memoryLimit = 3 - this.jobTempDir = qscript.tmpDir - this.jobQueue = "gsa"; - - //this.DBSNP = qscript.dbSNP - } - - def script = { - callThisChunk() // using scatter/gather capabilities of Queue so no need to for loop over 1Mb chunks of the chromosome - } - - def callThisChunk() = { - - val interval = "%d".format(qscript.chr) - for( population <- qscript.populations ) { - val baseName: String = qscript.outputDir + "/" + population + ".indels.phase1.chr" + qscript.chr.toString - var bamList: File = new File("/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams/%s.phase1.chr%d.cleaned.bam".format(population, qscript.chr)) - if( population == "ASN" || population == "EUR" || population == "AFR" || population == "AMR" ) { - bamList = new 
File("/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams/%s.chr%d.cleaned.list".format(population, qscript.chr)) - } - - val rawCalls = new File(baseName + ".raw.vcf") - val filteredCalls = new File(baseName + ".filtered.vcf") - val clusterFile = new File(baseName + ".omni.clusters") - val recalibratedCalls = new File(baseName + ".recal.vcf") - val tranchesFile = new File(baseName + ".ts.omni.tranches") - - var call = new UnifiedGenotyper with CommandLineGATKArgs - call.intervalsString ++= List(qscript.intervals) - call.scatterCount = 63 // the smallest interval list has 63 intervals, one for each Mb on chr20 - call.setupScatterFunction = { - case scatter: ScatterFunction => - scatter.commandDirectory = new File(qscript.outputDir+"IndelCalls/ScatterGather") - scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out") - } - call.setupCloneFunction = { - case (clone: CloneFunction, index: Int) => - clone.commandDirectory = new File(qscript.outputDir+"IndelCalls/ScatterGather/Scatter_%s".format(index)) - clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index)) - } - call.setupGatherFunction = { - case (gather: GatherFunction, source: ArgumentSource) => - gather.commandDirectory = new File(qscript.outputDir+"IndelCalls/ScatterGather/Gather_%s".format(source.field.getName)) - gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName)) - } - - - call.dcov = 50 - call.stand_call_conf = 4.0 - call.stand_emit_conf = 4.0 - call.input_file :+= bamList - call.out = rawCalls - call.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF - call.analysisName = baseName + "_UG" - call.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNPIndels) - call.glm = GenotypeLikelihoodsCalculationModel.Model.DINDEL - - - var filter = new VariantFiltration with CommandLineGATKArgs - filter.intervalsString ++= 
List(qscript.intervals) - filter.scatterCount = 10 - filter.variantVCF = rawCalls - filter.out = filteredCalls - filter.filterName ++= List("HARD_TO_VALIDATE") - filter.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"") - filter.analysisName = baseName + "_VF" - //filter.rodBind :+= RodBind("mask", "Bed", qscript.dindelMask) - //filter.maskName = "InDel" - - var gvc = new GenerateVariantClusters with CommandLineGATKArgs - //gvc.rodBind :+= RodBind("hapmap", "VCF", qscript.hapmap) - gvc.rodBind :+= RodBind("1kg", "VCF", qscript.g1k) - gvc.rodBind :+= RodBind("input", "VCF", filteredCalls ) - gvc.clusterFile = clusterFile - gvc.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") - gvc.analysisName = baseName + "_GVC" - gvc.intervalsString ++= List(qscript.intervals) - //gvc.qual = 100 // clustering parameters to be updated soon pending new experimentation results - //gvc.std = 4.5 - //gvc.mG = 6 - /* - var vr = new VariantRecalibrator with CommandLineGATKArgs - vr.rodBind :+= RodBind("1kg", "VCF", qscript.omni) - vr.rodBind :+= RodBind("hapmap", "VCF", qscript.hapmap) - vr.rodBind :+= RodBind("truthOmni", "VCF", qscript.omni) - vr.rodBind :+= RodBind("truthHapMap", "VCF", qscript.hapmap) - vr.rodBind :+= RodBind("input", "VCF", filteredCalls ) - vr.clusterFile = clusterFile - vr.analysisName = baseName + "_VR" - vr.intervalsString ++= List(qscript.intervals) - vr.ignoreFilter ++= List("HARD_TO_VALIDATE") - vr.target_titv = 2.3 - vr.sm = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY - vr.tranche ++= List("0.1", "1.0", "2.0", "3.0", "5.0", "10.0", "100.0") - vr.out = recalibratedCalls - vr.priorDBSNP = 10.0 - vr.priorHapMap = 12.0 - vr.prior1KG = 12.0 - vr.tranchesFile = tranchesFile - - add(call, filter, gvc, vr) */ - add(call, filter) - } - - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/delangel/Phase1IndelProjectConsensus.scala 
b/scala/qscript/oneoffs/delangel/Phase1IndelProjectConsensus.scala deleted file mode 100755 index d989e7839..000000000 --- a/scala/qscript/oneoffs/delangel/Phase1IndelProjectConsensus.scala +++ /dev/null @@ -1,236 +0,0 @@ -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils -import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType - -class Phase1ProjectConsensus extends QScript { - qscript => - - @Input(doc="path to GATK jar", shortName="gatk", required=true) - var gatkJar: File = _ - - @Input(doc="output path", shortName="outputDir", required=true) - var outputDir: String = _ - - @Input(doc="the chromosome to process", shortName="onlyChr20", required=false) - var onlyChr20: Boolean = false - - @Input(doc="the chromosome to process", shortName="indelsOnly", required=false) - var indelsOnly: Boolean = false - - @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=true) - var outputTmpDir: String = "/broad/shptmp/delangel" - - @Input(doc="Generate bam files", shortName="generateBAMs", required=false) - var generateBAMs: Boolean = false - - private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf" - val chromosomeLength = 
List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566,155270560) - val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","IBS","JPT","LWK","MXL","PUR","TSI","YRI") - private val snpAlleles: String = "/humgen/1kg/processing/production_wgs_phase1/consensus/ALL.phase1.wgs.union.pass.sites.vcf" - private val indelAlleles: String = "//humgen/1kg/processing/production_wgs_phase1/consensus/ALL.indels.combined.chr20.vcf" - - private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference - this.memoryLimit = Some(3) - this.jobQueue = "gsa" - - } - - class AnalysisPanel(val baseName: String, val pops: List[String], val jobNumber: Int, val chr: String) { - val rawVCFsnps = new File(qscript.outputDir + "/calls/chr" + chr + "/" + baseName + "/" + baseName + ".phase1.chr" + chr + "." + jobNumber + ".raw.snps.vcf") - val rawVCFindels = new File(qscript.outputDir + "/calls/chr" + chr + "/" + baseName + "/" + baseName + ".phase1.chr" + chr + "." + jobNumber + ".raw.indels.vcf") - - val callSnps = new UnifiedGenotyper with CommandLineGATKArgs - callSnps.out = rawVCFsnps - callSnps.dcov = 50 - callSnps.stand_call_conf = 4.0 - callSnps.stand_emit_conf = 4.0 - callSnps.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE - callSnps.jobName = qscript.outputTmpDir + "/calls/chr" + chr + "/" +baseName + ".phase1.chr" + chr + "." 
+ jobNumber + ".raw.snps" - callSnps.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP - callSnps.genotyping_mode = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - //callSnps.out_mode = org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES - callSnps.rodBind :+= RodBind("alleles", "VCF", qscript.snpAlleles) - callSnps.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - callSnps.sites_only = true - - val callIndels = new UnifiedGenotyper with CommandLineGATKArgs - callIndels.out = rawVCFindels - callIndels.dcov = 50 - callIndels.stand_call_conf = 4.0 - callIndels.stand_emit_conf = 4.0 - callIndels.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF - callIndels.jobName = qscript.outputTmpDir + "/calls/chr" + chr + "/" +baseName + ".phase1.chr" + chr + "." + jobNumber + ".raw.indels" - callIndels.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.INDEL - callIndels.genotyping_mode = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - callIndels.out_mode = org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES - callIndels.rodBind :+= RodBind("alleles", "VCF", qscript.indelAlleles) - callIndels.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - callIndels.sites_only = true - // indel specific stuff: add FisherStrand, add F-test statistic (excessive hets/homs) - - callIndels.A ++= List ("FisherStrand","GLstats") - } - - class Chromosome(val inputChr: Int) { - var chr: String = inputChr.toString - if(inputChr == 23) { chr = "X" } - - val snpCombine = new CombineVariants with CommandLineGATKArgs - val snpChrVCF = new File(qscript.outputDir + "/calls/" + "combined.phase1.chr" + chr + ".raw.snps.vcf") - snpCombine.out = snpChrVCF 
- snpCombine.intervalsString :+= chr - val indelCombine = new CombineVariants with CommandLineGATKArgs - val indelChrVCF = new File(qscript.outputDir + "/calls/" + "combined.phase1.chr" + chr + ".raw.indels.vcf") - indelCombine.out = indelChrVCF - indelCombine.intervalsString :+= chr - } - - def script = { - - var chrList = List(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23) - if (qscript.onlyChr20) { - chrList = List(20) - } - for(chr <- chrList) { - val chrObject = new Chromosome(chr) - val basesPerJob: Int = 1000000 - val lastBase: Int = qscript.chromosomeLength(chr - 1) - var start: Int = 1 - var stop: Int = start - 1 + basesPerJob - if( stop > lastBase ) { stop = lastBase } - var jobNumber: Int = 1 - while( jobNumber < (lastBase.toFloat / basesPerJob.toFloat) + 1.0) { - if( chr != 23 ) { - callThisChunk("%d:%d-%d".format(chr, start, stop), jobNumber, chr, chrObject) - } else { - callThisChunk("X:%d-%d".format(start, stop), jobNumber, chr, chrObject) - } - start += basesPerJob - stop += basesPerJob - if( stop > lastBase ) { stop = lastBase } - jobNumber += 1 - } - add(chrObject.indelCombine) - if (!qscript.indelsOnly) - add(chrObject.snpCombine) - } - } - - - def callThisChunk(interval: String, jobNumber: Int, inputChr: Int, chrObject: Chromosome) = { - - var chr: String = inputChr.toString - if(inputChr == 23) { chr = "X" } - - val AFRadmix = new AnalysisPanel("AFR.admix", List("LWK","YRI","ASW","CLM","PUR"), jobNumber, chr) - val AMRadmix = new AnalysisPanel("AMR.admix", List("MXL","CLM","PUR","ASW"), jobNumber, chr) - val EURadmix = new AnalysisPanel("EUR.admix", List("CEU","FIN","GBR","TSI","IBS","MXL","CLM","PUR","ASW"), jobNumber, chr) - val ASNadmix = new AnalysisPanel("ASN.admix", List("CHB","CHS","JPT","MXL","CLM","PUR"), jobNumber, chr) - val AFR = new AnalysisPanel("AFR", List("LWK","YRI","ASW"), jobNumber, chr) - val AMR = new AnalysisPanel("AMR", List("MXL","CLM","PUR"), jobNumber, chr) - val EUR = new AnalysisPanel("EUR", 
List("CEU","FIN","GBR","TSI","IBS"), jobNumber, chr) - val ASN = new AnalysisPanel("ASN", List("CHB","CHS","JPT"), jobNumber, chr) - val ALL = new AnalysisPanel("ALL", List("LWK","YRI","ASW","MXL","CLM","PUR","CEU","FIN","GBR","TSI","IBS","CHB","CHS","JPT"), jobNumber, chr) - - val analysisPanels = List(AFR, ASN, AMR, EUR, AFRadmix, ASNadmix, AMRadmix, EURadmix) //ALL - - val snpCombine = new CombineVariants with CommandLineGATKArgs - val indelCombine = new CombineVariants with CommandLineGATKArgs - val combinedIndelChunk = new File(qscript.outputDir + "/calls/chr" + chr + "/" + "combined.phase1.chr" + chr + "." + jobNumber + ".raw.indels.vcf") - val combinedSnpChunk = new File(qscript.outputDir + "/calls/chr" + chr + "/" + "combined.phase1.chr" + chr + "." + jobNumber + ".raw.snps.vcf") - - indelCombine.out = combinedIndelChunk - indelCombine.jobName = qscript.outputTmpDir + "/calls/chr" + chr + "/" + "combined.phase1.chr" + chr + "." + jobNumber + ".raw.indels" - indelCombine.intervalsString :+= interval - indelCombine.mergeInfoWithMaxAC = true - indelCombine.priority = "AFR.admix,AMR.admix,EUR.admix,ASN.admix,AFR,AMR,EUR,ASN" //ALL, - - snpCombine.out = combinedSnpChunk - snpCombine.jobName = qscript.outputTmpDir + "/calls/chr" + chr + "/" + "combined.phase1.chr" + chr + "." + jobNumber + ".raw.snps" - snpCombine.intervalsString :+= interval - snpCombine.mergeInfoWithMaxAC = true - snpCombine.priority = "AFR.admix,AMR.admix,EUR.admix,ASN.admix,AFR,AMR,EUR,ASN" //ALL, - - for( population <- qscript.populations ) { - val baseTmpName: String = qscript.outputTmpDir + "/calls/chr" + chr + "/" + population + ".phase1.chr" + chr + "." + jobNumber.toString + "." - val bamList: File = new File("/humgen/1kg/processing/production_wgs_phase1/bam_lists/%s.list".format(population)) - val targetIntervals: File = new File(baseTmpName + "target.intervals") - - // 1.) 
Create cleaning targets - val target = new RealignerTargetCreator with CommandLineGATKArgs - target.memoryLimit = 4 - target.input_file :+= bamList - target.intervalsString :+= interval - target.out = targetIntervals - target.mismatchFraction = 0.0 - target.maxIntervalSize = 700 - target.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls) - target.jobName = baseTmpName + "target" - //target.isIntermediate = true - target.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - - // 2.) Clean without SW - val clean = new IndelRealigner with CommandLineGATKArgs - val cleanedBam = new File(baseTmpName + "cleaned.bam") - clean.memoryLimit = 6 - clean.input_file :+= bamList - clean.intervalsString :+= interval - clean.targetIntervals = targetIntervals - clean.out = cleanedBam - clean.doNotUseSW = true - clean.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF - clean.simplifyBAM = true - clean.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls) - clean.jobName = baseTmpName + "clean" - //clean.isIntermediate = true - clean.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - - if (qscript.generateBAMs) { - add(target, clean) - } - - for( a <- analysisPanels ) { - for( p <- a.pops) { - if( p == population ) { - a.callIndels.input_file :+= cleanedBam - a.callSnps.input_file :+= cleanedBam - } - } - } - } - - for( a <- analysisPanels ) { - a.callSnps.intervalsString :+= interval - a.callIndels.intervalsString :+= interval - if(a.baseName == "ALL") { - a.callIndels.memoryLimit = 4 - a.callSnps.memoryLimit = 4 - } - if (!qscript.indelsOnly) - add(a.callSnps) - add(a.callIndels) - - snpCombine.rodBind :+= RodBind(a.baseName, "VCF", a.callSnps.out) - indelCombine.rodBind :+= RodBind(a.baseName, "VCF", a.callIndels.out) - } - - add(indelCombine) - if (!qscript.indelsOnly) - add(snpCombine) - - chrObject.snpCombine.rodBind :+= RodBind("ALL" + jobNumber.toString, "VCF", snpCombine.out) - chrObject.indelCombine.rodBind :+= RodBind("ALL" + 
jobNumber.toString, "VCF", indelCombine.out) - } -} diff --git a/scala/qscript/oneoffs/delangel/Phase1IndelVQSR.scala b/scala/qscript/oneoffs/delangel/Phase1IndelVQSR.scala deleted file mode 100755 index cd2234ef6..000000000 --- a/scala/qscript/oneoffs/delangel/Phase1IndelVQSR.scala +++ /dev/null @@ -1,258 +0,0 @@ -import collection.SeqLike._ -import management.CompilationMXBean -import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.IndelStatistics -import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.gatk.RodBind._ -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.queue.QScript -import org.apache.commons.io.FilenameUtils -import scala.Some -; - -class Phase1IndelVQSR extends QScript { - qscript => - // todo -- update to released version when things stabilize - @Argument(shortName = "gatk",doc="gatkJarFile", required=false) - var gatkJarFile: File = new File("/humgen/gsa-scr1/delangel/Sting_dev/dist/GenomeAnalysisTK.jar") - - @Argument(shortName = "R", doc="B37 reference sequence: defaults to broad standard location", required=false) - var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - @Argument(shortName = "intervals", doc="intervals to evaluate. 
Only supports evaluation on chromosome 20 now, as most evaluation data is there", required=false) - val TARGET_INTERVAL: String = "20" - - @Argument(shortName = "dataDir", doc="Path to the standard evaluation data files", required=false) - val DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Comparisons/StandardForEvaluation/b37/" - @Argument(shortName = "baseDir", doc="Path to the standard evaluation data files", required=false) - val baseDir = "/humgen/gsa-scr1/delangel/VQSRIndels/data/" - - @Argument(shortName = "outDir", doc="Path to the output files", required=false) - val OUT_DIR = "/humgen/gsa-scr1/delangel/VQSRIndels" - - @Argument(shortName = "rawCalls", doc="VQSR raw input file", required=false) - var rawCalls: File = new File("/humgen/gsa-hpprojects/dev/delangel/Phase1Calls/20110525VQSRConsensus/calls/combined.phase1.chr20.raw.indels.vcf") - - @Argument(shortName = "truth", doc="VQSR truth file", required=false) - var truthFile: File = new File("/humgen/gsa-scr1/delangel/devine_data/indel_hg19_051711_leftAligned_75percent_chr20.vcf" ) - - @Argument(shortName = "training", doc="VQSR training file", required=false) - var trainingFile: File = new File("/humgen/gsa-scr1/delangel/devine_data/indel_hg19_051711_leftAligned_75percent_chr20.vcf" ) - - var noMultiallelicSites: Boolean = false; - - val populations = List("EUR","AMR","ASN","AFR") - - @Argument(shortName = "evalStandard1000GCalls", doc="If provided, we'll include some standard 1000G data for evaluation", required=false) - val EVAL_STANDARD_1000G_CALLS: Boolean = true - - @Argument(shortName = "numG", doc="If provided, we'll include some standard 1000G data for evaluation", required=false) - val numG: Int = 4 - - @Argument(shortName = "pctBad", doc="If provided, we'll include some standard 1000G data for evaluation", required=false) - val pctBad: Double = 0.05 - - @Argument(shortName = "runName", doc="Run Name", required=false) - val runName:String = "mills100" - val COMPS_DIR = DATA_DIR + "comps/" - val 
EVALS_DIR = DATA_DIR + "evals/" - - @Argument(shortName = "createAllPos", doc="If provided, create all POPS file", required=false) - val CREATE_ALL_POPS_FILE: Boolean = false - - @Argument(shortName = "pops", doc="Populations to do", required=false) - val moreIndelsToEval: List[String] = List("EUR","ASN","AFR","AMR") - - - val VARIANT_TYPES: List[String] = List("indels", "snps") - - val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map( - "indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION), - "snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION) - ) - - val SITES_DIR: String = "sitesFiles" - - // path to b37 DBSNP - val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf") - val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf" - - - val DINDEL: String = "/humgen/gsa-scr1/delangel/officialCalls/20110201_chr20_phase1_indels/dindel/20110208.chr20.dindel2.ALL.sites.fixed.vcf" - val SI: String = "/humgen/gsa-scr1/delangel/officialCalls/20101123.chr20.si.v2.combined.sites.leftAligned.vcf" - val BI: String = "/humgen/1kg/processing/official_release/phase1/ALL.wgs.broad.20101123.indels.sites.vcf" - val BC: String = "/humgen/gsa-scr1/delangel/officialCalls/20110201_chr20_phase1_indels/ALL.chr20.bc.20101123.indels.sites.leftAligned.vcf" - val OX: String = "/humgen/gsa-scr1/delangel/otherIndelCallerAnalysis/ALL.chr20.Oxford.20110407.indels.genotypes.sites.vcf" - - var COMPS: List[Comp] = Nil - def addComp(comp: Comp) { COMPS = comp :: COMPS } - - var EVALS: List[Eval] = Nil - def addEval(eval: Eval) { 
EVALS = eval :: EVALS } - def addEvalFromCMD(file: File, t: String) { addEval(new Eval(file.getName, t, file.getName)) } - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJarFile - this.reference_sequence = qscript.referenceFile - this.memoryLimit = Some(2) - // this.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - this.jobQueue = "hour" - this.intervalsString = List(TARGET_INTERVAL); - - } - class Comp(val name: String, val evalType: String, val filename: String) { - val file: File = new File(filename) - } - - class Eval(val name: String, val evalType: String, val filename: String ) { - val file: File = new File(filename) - } - - def initializeStandardDataFiles() = { - // - // Standard evaluation files for indels - // - //addComp(new Comp("CG.38samples", "indels", COMPS_DIR+"CG.Indels.leftAligned.b37.vcf")) - addComp(new Comp("g1k.pilot1.validation", "indels", COMPS_DIR+"pilot1_indel_validation_2009.b37.vcf")) - //addComp(new Comp("NA12878.hand_curated", "indels", "NA12878.validated.curated.polymorphic.indels.vcf")) - addComp(new Comp("NA12878.Mullikin", "indels", COMPS_DIR+"NA12878.DIPline.NQScm.expanded.chr20.b37.minReads_2_or_gt2bp.vcf")) - addComp(new Comp("Mills.25pct", "indels", "/humgen/gsa-scr1/delangel/devine_data/indel_hg19_051711_leftAligned_25percent_chr20.vcf")) - //addComp(new Comp("Phase1Validation", "indels", "/humgen/gsa-scr1/delangel/VQSRIndels/1KG_Validation_Phase1_SNPs_05032011.HG19.finalized.vcf")) - addComp(new Comp("Phase1Validation", "indels", "/humgen/gsa-scr1/delangel/VQSRIndels/1000G.20101123.validation_set_v1.QCed.indels.vcf")) - - - // - // INDEL call sets - // - - if ( EVAL_STANDARD_1000G_CALLS ) { - addEval(new Eval("dindel", "indels",qscript.DINDEL)) - addEval(new Eval("si", "indels",qscript.SI)) - addEval(new Eval("bi", "indels", qscript.BI)) - addEval(new Eval("bc", "indels", qscript.BC)) - addEval(new Eval("ox", "indels", qscript.OX)) - addEval(new Eval("2of5", "indels", 
"/humgen/gsa-scr1/delangel/otherIndelCallerAnalysis/ALL.indels.2of5.chr20.vcf")) - addEval(new Eval("2of5noMulti", "indels", "/humgen/gsa-scr1/delangel/otherIndelCallerAnalysis/ALL.indels.2of5.chr20.noMultiAllelic.vcf")) - addEval(new Eval("union", "indels", "/humgen/gsa-scr1/delangel/otherIndelCallerAnalysis/ALL.indels.combined.chr20.vcf")) -// addEval(new Eval("unionNoMulti", "indels", "/humgen/gsa-scr1/delangel/otherIndelCallerAnalysis/ALL.indels.combined.chr20.noMultiAllelic.vcf")) - } - - } - - def script = { - - initializeStandardDataFiles(); - - var ts:Double = 0.0 - var tranches = List("99.9","99.0","98.0","97.0","96.0","95.0","92.0","90.0","85.0","80.0","70.0") - - var numG:Int = qscript.numG - var pctBad:Double = qscript.pctBad - val runName:String = qscript.runName + "_mG%d_pb%1.2f_QD_FS_HS_RP_IC".format(numG,pctBad) - - val rawCalls = qscript.rawCalls - var tranchesFile = new File(qscript.baseDir +"%s.tranches".format(runName)) - var recalFile = new File(qscript.baseDir +"%s.recal".format(runName)) - var rscriptFile = new File(qscript.baseDir +"%s.plots.R".format(runName)) - - - - var vr = new VariantRecalibrator with CommandLineGATKArgs - vr.rodBind :+= RodBind("input", "VCF",rawCalls ) - vr.rodBind :+= RodBind("truth", "VCF",qscript.truthFile,"known=true,training=false,truth=true,prior=15.0" ) - vr.rodBind :+= RodBind("training", "VCF",qscript.trainingFile,"known=true,training=true,truth=false,prior=12.0" ) - //vr.rodBind :+= RodBind("training2", "VCF",qscript.dindelCalls,"known=true,training=true,truth=false,prior=12.0" ) - vr.rodBind :+= RodBind("dbsnp", "VCF",qscript.MY_DBSNP,"known=true,training=false,truth=false,prior=8.0" ) - - vr.rodBind :+= RodBind("BC", "VCF",qscript.BC,"consensus=true" ) - vr.rodBind :+= RodBind("BI", "VCF",qscript.BI,"consensus=true" ) - vr.rodBind :+= RodBind("SI", "VCF",qscript.SI,"consensus=true" ) - vr.rodBind :+= RodBind("DINDEL", "VCF",qscript.DINDEL,"consensus=true" ) - vr.rodBind :+= RodBind("OXFORD", 
"VCF",qscript.OX,"consensus=true" ) - - vr.mode = VariantRecalibratorArgumentCollection.Mode.INDEL - vr.tranchesFile = tranchesFile - vr.recalFile = recalFile - vr.rscriptFile = rscriptFile -// vr.an = List("QD","FS","HaplotypeScore","ReadPosRankSum","InbreedingCoeff","SB","") - vr.an = List("QD","FS","HaplotypeScore","ReadPosRankSum","InbreedingCoeff") - vr.maxGaussians = Some(numG) - vr.tranche = tranches - vr.nt = Some(8) - vr.percentBad = Some(pctBad) - vr.std = Some(12.0) - //vr.ignore_filter = List("LowQual") - add(vr) - - val VE = new MyEval() - VE.VT = VARIANT_TYPE_VT("indels") - VE.o = new File(OUT_DIR+"/"+ runName + ".eval") - - for (tas: String <- tranches) { - ts = tas.toDouble - val outFile = new File("/humgen/gsa-hpprojects/dev/delangel/Phase1Calls/20110603VQSRConsensus/calls/phase1.chr20.recal_%s_ts_%4.1f.indels.sites.vcf".format(runName,ts)) - - var ar = new ApplyRecalibration with CommandLineGATKArgs - ar.rodBind :+= RodBind("input", "VCF",rawCalls ) - ar.mode = VariantRecalibratorArgumentCollection.Mode.INDEL - ar.tranchesFile = tranchesFile - ar.recalFile = recalFile - ar.ts_filter_level = Some(ts) - ar.sites_only = true - ar.o = outFile - add(ar) - - VE.rodBind :+= RodBind("eval_ts%4.1f".format(ts), "VCF", ar.o) - - } - - - //VE.nt = Some(8) - - // add evals - for ( calls <- EVALS ) - VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file) - - // add comps - // VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP) - for ( comp <- COMPS ) - VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.file) - - add(VE) - - var ve2 = new MyEval - for (tas: String <- tranches) { - ts = tas.toDouble - val outFile = new File("/humgen/gsa-hpprojects/dev/delangel/Phase1Calls/20110603VQSRConsensus/calls/phase1.chr20.recal_%s_ts_%4.1f.indels.sites.vcf".format(runName,ts)) - ve2.rodBind :+= RodBind("eval_ts%4.1f".format(ts), "VCF", outFile) - } - - // comps are now other callsets to measure overlap - ve2.rodBind :+= RodBind("comp_dindel", 
"VCF",qscript.DINDEL) - ve2.rodBind :+= RodBind("comp_bc", "VCF", qscript.BC) - ve2.rodBind :+= RodBind("comp_bi", "VCF", qscript.BI) - ve2.rodBind :+= RodBind("comp_ox", "VCF", qscript.OX) - ve2.rodBind :+= RodBind("comp_2of5", "VCF", "/humgen/gsa-scr1/delangel/otherIndelCallerAnalysis/ALL.indels.2of5.chr20.vcf") - ve2.VT = VARIANT_TYPE_VT("indels") - ve2.o = new File(OUT_DIR+"/"+ runName + ".comps.eval") - add(ve2) - } - - - /** - * Base class for VariantEval used here - */ - class MyEval() extends VariantEval with CommandLineGATKArgs { - this.noST = true - this.nt = Some(8) - this.evalModule :+= "ValidationReport" - //this.evalModule :+= "IndelMetricsByAC" - this.evalModule :+= "IndelStatistics" - this.evalModule :+= "CountVariants" - this.evalModule :+= "CompOverlap" - //this.evalModule :+= "IndelClasses" - } - - - -} diff --git a/scala/qscript/oneoffs/depristo/1kg_table1.scala b/scala/qscript/oneoffs/depristo/1kg_table1.scala deleted file mode 100755 index 020f66e4e..000000000 --- a/scala/qscript/oneoffs/depristo/1kg_table1.scala +++ /dev/null @@ -1,178 +0,0 @@ -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.{GenotypeMergeType, VariantMergeType} -import org.broadinstitute.sting.playground.utils.report.VE2ReportFactory.VE2TemplateType -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript - -class Onekg_table1 extends QScript { - @Argument(doc="stage") - var stage: String = _ - - @Argument(doc="gatkJarFile") - var gatkJarFile: File = _ - - @Argument(shortName = "R", doc="gatkJarFile") - var referenceFile: File = _ - -trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; reference_sequence = referenceFile } // -L 1 - -class Target(project: String, snpVCF: String, indelVCF: String, calledGenome: Double, targetGenome: Double, pop: String, pilot : String, bam: String = null) { - def reportFile: String = List(pop, pilot, 
"report").mkString(".") - def extraArgs = { - val basic = "--project %s --snps %s --calledGenome %f --totalGenome %f --pop %s".format(project, snpVCF, calledGenome, targetGenome, pop) - basic + (if ( indelVCF == null ) "" else " --indels " + indelVCF) - } - - def getPilot = pilot - def getProject = project - def getPop = pop - def getSNPVCF = snpVCF - def getIndelVCF = indelVCF - def hasIndelVCF = indelVCF != null - def getBAM = bam - def hasBAM = bam != null - def getDOC = List(getPilot, getPop, getProject, "doc").mkString(".") - def getDOCSummaryFile = "doc/" + getDOC + ".sample_summary" - def hasDOC = hasBAM - private def getEval(t: String) = List(getPilot, getPop, getProject, t, "eval").mkString(".") - def getSNPEval = getEval("snps") - def getIndelEval = getEval("indels") -} - -val RELEASE = "/humgen/1kg/DCC/ftp/release/2010_07/" - -var targets: List[Target] = List() - -val p1Targets = List(("CEU", 2.43e9), ("YRI", 2.39e9), ("CHBJPT", 2.41e9)) - -for ( (pop: String,called) <- p1Targets ) - targets ::= new Target("SRP000031", pop + ".pilot1.vcf", "v1/dindel-v2/"+pop+".low_coverage.2010_06.indel.genotypes.vcf", called, 2.85e9, pop, "pilot1") - -// pilot 2 -val p2Targets = List(("CEU", 2.264e9), ("YRI", 2.214e9)) -for ( (pop: String, called) <- p2Targets ) - targets ::= new Target("SRP000032", RELEASE + "trio/snps/" + pop + ".trio.2010_03.genotypes.vcf.gz", "v1/dindel-v2/"+pop+".trio.2010_06.indel.genotypes.vcf", called, 2.85e9, pop, "pilot2") - -// pilot 3 -for (pop <- List("CEU", "CHB", "CHD", "JPT", "LWK", "TSI", "YRI")) { - val indels = if ( pop != "LWK" ) "exon/indel/"+pop+".exon.2010_06.genotypes.vcf.gz" else null - targets ::= new Target("SRP000033", "exon/snps/" + pop + ".exon.2010_03.genotypes.vcf.gz", indels, 1.43e6, 1.43e6, pop, "pilot3", "/humgen/gsa-hpprojects/1kg/1kg_pilot3/useTheseBamsForAnalysis/pilot3.%s.cleaned.bam".format(pop)) -} - -// merged files -targets ::= new Target("SRP000031", "pilot1.snps.merged.vcf", "pilot1.indels.merged.vcf", 
2.42e9, 2.85e9, "all", "pilot1.merged") -targets ::= new Target("SRP000032", "pilot2.snps.merged.vcf", "pilot2.indels.merged.vcf", 2.565e9, 2.85e9, "all", "pilot2.merged") -targets ::= new Target("SRP000033", "pilot3.snps.merged.vcf", "pilot3.indels.merged.vcf", 1.43e7, 1.43e7, "all", "pilot3.merged") -targets ::= new Target("SRP00003.", "1kg.snps.merged.vcf", "1kg.indels.merged.vcf", 2.42e7, 2.85e9, "all", "1kg.merged") - -val INTERVALS = Map( - "pilot1" -> null, - "pilot2" -> null, - "pilot3" -> "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list" - ) - -def script = stage match { - case "ALL" => - // initial pilot1 merge -- autosomes + x - for ( (pop: String,called) <- p1Targets ) { - val auto = RELEASE + "low_coverage/snps/"+ pop +".low_coverage.2010_07.genotypes.vcf.gz" - // todo -- remove fixed when Laura gives us the official calls - val x = RELEASE + "low_coverage/snps/"+ pop +".low_coverage.2010_07.xchr.fixed.genotypes.vcf" - val combineSNPs = new Combine(List(auto, x), pop + ".pilot1.vcf") - add(combineSNPs) - } - - // create pilot wide merges - val pilots = List("pilot2", "pilot1", "pilot3") // order of perference in merging - for ( pilot <- pilots ) { - val pilotTargets = targets filter (_.getPilot == pilot) - val combineSNPs = new Combine(pilotTargets.map(_.getSNPVCF), pilot + ".snps.merged.vcf") - add(combineSNPs) - - if ( pilotTargets(0).getIndelVCF != null ) { - val combineIndels = new Combine(pilotTargets.map(_.getIndelVCF).filter((x: String) => x != null), pilot + ".indels.merged.vcf") - add(combineIndels) - } - } - - // create project wide merges - val snps = "1kg.snps.merged.vcf" - val indels = "1kg.indels.merged.vcf" - - //add(new Combine(pilots.map(_ + ".snps.merged.vcf"), snps)) - add(new Combine(pilots.map(_ + ".indels.merged.vcf"), indels)) - - - case "EVAL" => - // VariantEval of the SNPs - for (target <- targets) { - add(new VariantEval(target.getSNPVCF, 
target.getSNPEval)) - //add(new StatPop(target)) - } - - case "DOC" => - for (target <- targets) { - if ( target.hasBAM ) - add(new DepthOfCoverage(target.getBAM, target.getDOC, INTERVALS(target.getPilot))) - } - case "MASK" => - for ( pop <- List("CEU", "YRI", "CHBJPT") ) - add(new MaskStats(pop)) - - case _ => throw new Exception("Unknown stage" + stage) -} - -// Using scala anonymous classes -class VariantEval(vcfIn: String, evalOut: String, vcfType: String = "VCF") extends org.broadinstitute.sting.queue.extensions.gatk.VariantEval with UNIVERSAL_GATK_ARGS { - val vcfFile = new File(vcfIn) - this.rodBind :+= RodBind("eval", vcfType, vcfFile) - this.out = new File(evalOut) - this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod") - this.reportType = VE2TemplateType.Grep - this.noStandard = true; - this.evalModule :+= "CompOverlap" - this.memoryLimit = 3 - - override def dotString = "VariantEval: " + vcfFile.getName -} - -class StatPop(target: Target) extends CommandLineFunction { - @Input(doc="foo") var snpVCF = new File(target.getSNPVCF) - @Input(doc="foo") var snpEval = new File(target.getSNPEval) - @Input(doc="foo", required=false) var indelVCF: File = if (target.hasIndelVCF) new File(target.getIndelVCF) else { null } - @Output(doc="foo") var reportFile: File = new File(target.reportFile) - override def dotString = "1kgStats: " + reportFile - def commandLine = "python ~/dev/GenomeAnalysisTK/trunk/python/1kgStatsForCalls.py -v -a pilot_data.alignment.index -s pilot_data.sequence.index -r /broad/1KG/DCC/ftp/ -o " + target.reportFile + " " + target.extraArgs + (if (target.hasDOC) " -c " + target.getDOCSummaryFile else "") + " --snpsEval " + target.getSNPEval + (if (target.hasIndelVCF) " --indels " + target.getIndelVCF else "") -} - -class Combine(vcfsInArg: List[String], vcfOutPath: String) extends org.broadinstitute.sting.queue.extensions.gatk.CombineVariants with UNIVERSAL_GATK_ARGS { - val vcfs = vcfsInArg.map((x: String) => new File(x)) - 
val vcfFile = new File(vcfOutPath) - this.variantmergeoption = VariantMergeType.UNION - this.genotypemergeoption = GenotypeMergeType.PRIORITIZE - this.out = vcfFile - this.rodBind ++= vcfs.map( input => RodBind(input.getName,"VCF",input) ) - this.rod_priority_list = vcfs.map( _.getName ).mkString(",") - override def dotString = "CombineVariants: " + vcfs.map(_.getName).mkString(",") + " => " + vcfFile.getName -} - -class MaskStats(pop: String) extends CommandLineFunction { - @Output(doc="foo") var outFile: File = new File(pop + ".stats") - def commandLine = "python ~/dev/GenomeAnalysisTK/trunk/python/maskStats.py masks/" + pop + ".mask.fa.gz -x MT -x Y -o " + outFile -} - -class DepthOfCoverage(bam: String, docOutPath: String, interval: String) extends org.broadinstitute.sting.queue.extensions.gatk.DepthOfCoverage with UNIVERSAL_GATK_ARGS { - val bamFile = new File(bam) - this.omitIntervalStatistics = true - this.omitDepthOutputAtEachBase = true - this.minBaseQuality = 0 - this.minMappingQuality = 0 - this.out = new File(docOutPath) - this.input_file :+= bamFile - if (interval != null) { - this.intervalsString :+= interval - this.excludeIntervalsString ++= List("MT", "Y") - } - - override def dotString = "DOC: " + bamFile.getName -} -} diff --git a/scala/qscript/oneoffs/depristo/AssessChipCoverageOfPanel.q b/scala/qscript/oneoffs/depristo/AssessChipCoverageOfPanel.q deleted file mode 100755 index 00eb5cc81..000000000 --- a/scala/qscript/oneoffs/depristo/AssessChipCoverageOfPanel.q +++ /dev/null @@ -1,93 +0,0 @@ -package oneoffs.depristo - -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript - -class AssessChipCoverageOfPanel extends QScript { - qscript => - @Argument(doc="Path to GATK jar",required=false,shortName="gatkjarfile") var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar") - @Argument(doc="Panel VCF",required=true,shortName="panelVCF") var panelVCF: File = _ - @Argument(doc="BAM 
File",required=true, shortName="bam") var bam: File = null - @Argument(doc="Bundle path",required=false, shortName="bundle") var bundle: File = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/") - - @Argument(shortName = "R", doc="ref", required=true) - var referenceFile: File = _ - - @Argument(shortName = "L", doc="intervals", required=false) - val TARGET_INTERVAL: String = null; - - def HM3_VCF: File = new File(bundle + "/hapmap_3.3.b37.sites.vcf") - def OMNI_VCF: File = new File(bundle + "/1000G_omni2.5.b37.sites.vcf") - - trait GATKArgs extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - if ( TARGET_INTERVAL != null ) - this.intervalsString = List(TARGET_INTERVAL); - this.reference_sequence = referenceFile; - this.memoryLimit = 2 - } - - // -------------------------------------------------------------------------------- - // - // GENOTYPING SPECIFIC SITES IN A BAM FILE - // - // -------------------------------------------------------------------------------- - - class GenotypeBAMAtSites(@Input bam: File, @Input sitesVCF: File, @Output genotypesVCF: File) extends UnifiedGenotyper with GATKArgs { - this.input_file = List(bam) - this.o = genotypesVCF - this.stand_call_conf = 0.0 - this.out_mode = org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES - this.gt_mode = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - this.rodBind :+= new RodBind("alleles","VCF",sitesVCF) - - // we only want chromosome counts annotations - this.BTI = "alleles" - this.G = List("none") - this.A :+= "ChromosomeCounts" - this.nsl = true - this.nt = 4 - - // make sure we have the right intervals - this.BTIMR = org.broadinstitute.sting.utils.interval.IntervalSetRule.INTERSECTION - } - - class EvalCalls(@Input vcf: File) extends VariantEval with GATKArgs { - this.o = swapExt(vcf, ".vcf", ".vcf.eval") - this.rodBind :+= 
RodBind("eval", "VCF", vcf) - this.rodBind :+= RodBind("compOMNI", "VCF", OMNI_VCF) - this.rodBind :+= RodBind("compHapMap3", "VCF", HM3_VCF) - } - - class AnnotateCalls(@Input vcf: File, @Output file: File) extends VariantAnnotator with GATKArgs { - this.o = file - this.rodBind :+= RodBind("variant", "VCF", vcf) - this.rodBind :+= RodBind("compOMNI", "VCF", OMNI_VCF) - this.rodBind :+= RodBind("compHapMap3", "VCF", HM3_VCF) - this.rodBind :+= RodBind("panel", "VCF", panelVCF) - this.expression = List("panel.AC", "panel.AN", "panel.AF") - } - - class MakeTable(@Input vcf: File) extends VariantsToTable with GATKArgs { - @Output val table = new File(swapExt(vcf, ".vcf", ".vcf.table")) - this.o = table - this.rodBind :+= RodBind("variants", "VCF", vcf) - this.allowMissingData = true - this.fields = List("CHROM", "POS", "REF", "ALT", "TRANSITION", "HapMap3", - "OMNI", "AC", "AN", "AF", "panel.AC", "panel.AN", "panel.AF") - } - - def script = { - val genotyped = new File(swapExt(bam, ".bam", "_genotyped_at." 
+ panelVCF.getName).getName) - val panelAnnotated = new File(swapExt(panelVCF, ".vcf", ".annotated.vcf")) - val annotated = new File(swapExt(genotyped, ".vcf", ".annotated.vcf")) - - add(new GenotypeBAMAtSites(bam, panelVCF, genotyped)) - add(new AnnotateCalls(panelVCF, panelAnnotated)) - add(new AnnotateCalls(genotyped, annotated)) - add(new EvalCalls(annotated)) - add(new MakeTable(annotated)) - add(new MakeTable(panelAnnotated)) - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/depristo/CleaningTest.scala b/scala/qscript/oneoffs/depristo/CleaningTest.scala deleted file mode 100755 index dc360a4b3..000000000 --- a/scala/qscript/oneoffs/depristo/CleaningTest.scala +++ /dev/null @@ -1,145 +0,0 @@ -package oneoffs.depristo - -//import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -import collection.JavaConversions._ -import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction -import org.broadinstitute.sting.queue.function.JavaCommandLineFunction - - -class CleaningTest extends QScript { - qscript => - - @Input(doc="path to GATK jar", shortName="gatk", required=false) - var gatkJar: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar") - - @Input(doc="the chromosome to process", shortName="chr", required=false) - var chr: String = "20" - - @Input(doc="the chromosome to process", shortName="L", required=false) - var range: String = _ - - @Input(doc="output path", shortName="outputDir", required=false) - var outputDir: String = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/isizeConstrainedRealigner/" - - @Input(doc="base output filename", shortName="baseName", required=false) - var baseName: String = "" - - @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false) - var outputTmpDir: String = 
"/broad/shptmp/depristo/tmp" - - @Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false) - var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar") - var picardValidateJar: File = new java.io.File("/seq/software/picard/current/bin/ValidateSamFile.jar") - var picardSortSamJar: File = new java.io.File("/seq/software/picard/current/bin/SortSam.jar") - - private val tmpDir: File = new File("/broad/shptmp/depristo/tmp/") - private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - private val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/EUR.dindel_august_release.20110110.sites.vcf.gz" -// val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566) - -// private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference - this.memoryLimit = 4 - this.jobTempDir = qscript.tmpDir - } - - def script = { - val interval = qscript.chr - val bamList: File = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/isizeConstrainedRealigner/CEU.chr%s.list".format(qscript.chr)) - //val bamList: File = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/isizeConstrainedRealigner/FIN.chr%s.3samples.list".format(qscript.chr)) - val targetIntervals: File = new File("%s/chr_%s.intervals".format(outputDir, qscript.chr)) - - Console.println("interval " + interval) - - // 1.) 
Create cleaning targets - var target = new RealignerTargetCreator with CommandLineGATKArgs - target.input_file :+= bamList - target.intervalsString :+= interval - target.out = targetIntervals - target.mismatchFraction = 0.0 - target.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP) - target.rodBind :+= RodBind("indels3", "VCF", qscript.dindelEURCalls) - //target.jobName = baseName + ".target" - add(target) - - for ( cm <- List(true, false) ) { - // 2.) Clean without SW - var clean = new IndelRealigner with CommandLineGATKArgs - val cleanedBam = new File(outputDir + "cleaned.cm_%b.bam".format(cm)) - - clean.input_file :+= bamList - clean.intervalsString :+= interval + (if ( range != null ) ":" + range else "") - clean.targetIntervals = targetIntervals - clean.out = if ( cm ) cleanedBam else new File(cleanedBam + ".intermediate.bam") - clean.doNotUseSW = true - clean.constrainMovement = cm - clean.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF - clean.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP) - clean.rodBind :+= RodBind("indels3", "VCF", qscript.dindelEURCalls) - //clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true - //clean.jobName = baseName + cm + ".clean" - - Console.println("CLEAN") - add(clean) - - if ( ! cm ) { - // Explicitly run fix mates if the function won't be scattered. - val fixMates = new PicardBamFunction { - // Declare inputs/outputs for dependency tracking. 
- @Input(doc="unfixed bam") var unfixed: File = _ - @Output(doc="fixed bam") var fixed: File = _ - def inputBams = List(unfixed) - def outputBam = fixed - } - - //fixMates.jobOutputFile = new File(".queue/logs/Cleaning/%s/FixMates.out".format(sampleId)) - fixMates.memoryLimit = 4 - fixMates.jarFile = qscript.picardFixMatesJar - fixMates.unfixed = clean.out - fixMates.fixed = cleanedBam - //fixMates.analysisName = "FixMates" - - // Add the fix mates explicitly - Console.println("fixMates") - add(fixMates) - } - - val validate = new JavaCommandLineFunction { - // Declare inputs/outputs for dependency tracking. - @Input(doc="unfixed bam") var unfixed: File = _ - def inputBams = List(unfixed) - override def commandLine = super.commandLine + "%s%s%s IGNORE=INVALID_CIGAR IGNORE=MATE_NOT_FOUND".format( - optional(" VALIDATION_STRINGENCY=", "SILENT"), repeat(" INPUT=", inputBams), " TMP_DIR=" + jobTempDir) - } - - //fixMates.jobOutputFile = new File(".queue/logs/Cleaning/%s/FixMates.out".format(sampleId)) - validate.memoryLimit = 2 - validate.jarFile = qscript.picardValidateJar - validate.unfixed = cleanedBam - add(validate) - - val toQueryName = new PicardBamFunction { - // Declare inputs/outputs for dependency tracking. 
- @Input(doc="coordiante bam") var cobam: File = _ - @Output(doc="query bam") var qnbam: File = _ - def inputBams = List(cobam) - def outputBam = qnbam - } - - //fixMates.jobOutputFile = new File(".queue/logs/Cleaning/%s/FixMates.out".format(sampleId)) - toQueryName.memoryLimit = 4 - toQueryName.jarFile = qscript.picardSortSamJar - toQueryName.cobam = cleanedBam - toQueryName.qnbam = new File(cleanedBam.getAbsolutePath + ".qn.bam") - add(toQueryName) - - Console.println("loop done") - } - } -} diff --git a/scala/qscript/oneoffs/depristo/ExomePostQCEval.scala b/scala/qscript/oneoffs/depristo/ExomePostQCEval.scala deleted file mode 100755 index 919cb105a..000000000 --- a/scala/qscript/oneoffs/depristo/ExomePostQCEval.scala +++ /dev/null @@ -1,80 +0,0 @@ -package oneoffs.depristo - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.function.JavaCommandLineFunction - -class ExomePostQCEval extends QScript { - @Argument(doc="gatkJarFile", required=false) - var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar") - - @Argument(shortName = "R", doc="ref", required=false) - var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - // todo -- should accept argument list as well - @Argument(shortName = "eval", doc="VCFs to evaluate", required=true) - var evalVCFs: List[File]= _ - - @Argument(shortName = "intervals", doc="intervals", required=true) - val myIntervals: String = null; - - @Argument(shortName = "RPath", doc="RPath", required=false) - var RPath: File = new File("../R") - - @Argument(shortName = "dbSNP", doc="dbSNP", required=false) - val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_132.b37.vcf") - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - this.logging_level = "INFO"; - 
this.jarFile = gatkJarFile; - this.reference_sequence = referenceFile; - this.memoryLimit = 4 - } - - // TODO -- should include "standard" eval for plotting expectations - - def script = { - for ( evalVCF <- evalVCFs ) { - // The basic summary eval - createEval(evalVCF, ".summary", - List("TiTvVariantEvaluator", "CountVariants", "CompOverlap"), - List("FunctionalClass")) - - // The basic summary eval, by AF - createEval(evalVCF, ".byAC", - List("TiTvVariantEvaluator", "CountVariants", "CompOverlap"), - List("AlleleCount")) - - // By sample - createEval(evalVCF, ".bySample", - List("TiTvVariantEvaluator", "CountVariants", "CompOverlap"), - List("Sample")) - add(new ExomeQCRScript(evalVCF)) - } - } - - def createEval(evalVCF: File, prefix: String, evalModules: List[String], extraStrats: List[String]) { - val eval = new Eval(evalVCF) - eval.out = swapExt(evalVCF,".vcf", prefix + ".eval") - eval.evalModule = evalModules - eval.stratificationModule = List("EvalRod", "CompRod", "Novelty") ::: extraStrats - add(eval) - } - - class Eval(@Input vcf: File) extends VariantEval with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("eval", "VCF", vcf) - if ( dbSNP.exists() ) - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.doNotUseAllStandardStratifications = true - this.doNotUseAllStandardModules = true - this.intervalsString = List(myIntervals); - } - - class ExomeQCRScript(vcf: File) extends CommandLineFunction { - @Output var pdf: File = swapExt(vcf,".vcf", ".pdf") - val root = swapExt(vcf,".vcf", "") // remove the prefix - def commandLine = "Rscript %s/exomeQC.R %s %s %s".format(RPath, root, root, pdf) - } -} - diff --git a/scala/qscript/oneoffs/depristo/IndelCallerEvaluation.scala b/scala/qscript/oneoffs/depristo/IndelCallerEvaluation.scala deleted file mode 100755 index 30c577806..000000000 --- a/scala/qscript/oneoffs/depristo/IndelCallerEvaluation.scala +++ /dev/null @@ -1,113 +0,0 @@ -package oneoffs.depristo - -import org.broadinstitute.sting.queue.QScript 
-import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.function.JavaCommandLineFunction - -class IndelCallerEvaluation extends QScript { - val BUNDLE = "/humgen/gsa-hpprojects/GATK/bundle/current" - - @Argument(doc="gatkJarFile", required=false) - var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar") - - @Argument(shortName = "R", doc="ref", required=false) - var referenceFile: File = new File(BUNDLE + "/b37/human_g1k_v37.fasta") - - @Argument(shortName = "bam", doc="BAM", required=true) - val bams: List[File] = null; - - @Argument(shortName = "intervals", doc="intervals", required=false) - val myIntervals: String = null; - - @Argument(shortName = "dcov", doc="dcov", required=false) - val DCOV: Int = 250; - - val dbSNP: File = new File(BUNDLE + "/b37/dbsnp_132.b37.vcf") - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - this.reference_sequence = referenceFile; - this.memoryLimit = 4 - - if ( intervals != null ) - this.intervalsString = List(myIntervals); - } - - trait CoFoJa extends JavaCommandLineFunction { - override def javaOpts = super.javaOpts // + " -javaagent:lib/cofoja.jar" - } - - def processOne(bam: File, gsaProduction: Boolean): File = { - val rawVCF = new Call(bam, gsaProduction) - add(rawVCF) - - val filterIndels = new FilterIndels(rawVCF.out) - add(filterIndels) - - // create a variant eval for us - add(new Eval(filterIndels.out)) - return filterIndels.out - } - - def script = { - for ( gsaProduction <- List(true, false)) { - val vcfs = bams.map(processOne(_, gsaProduction)) - - val combineCalls = new CombineVariants with UNIVERSAL_GATK_ARGS - for ( vcf <- vcfs ) - combineCalls.rodBind :+= RodBind(vcf.getName, "VCF", vcf) - - combineCalls.filteredrecordsmergetype = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED - combineCalls.out = "combined" + 
productionString(gsaProduction) + ".vcf" - add(combineCalls) - - add(new ToTable(combineCalls.out)) - } - } - - class FilterIndels(@Input vcf: File) extends VariantFiltration with UNIVERSAL_GATK_ARGS { - this.variantVCF = vcf - this.filterName = List("Indel_QUAL", "Indel_SB", "Indel_QD") - this.filterExpression = List("\"QUAL<30.0\"", "\"SB>-1.0\"", "\"QD<2.0\"") - this.out = swapExt(vcf,".vcf",".filtered.vcf") - } - - class ToTable(@Input vcf: File) extends VariantsToTable with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("variant", "VCF", vcf) - this.fields = List("FILTER", "set") - this.out = swapExt(vcf,".vcf",".table") - this.raw = true - } - - class Eval(@Input vcf: File) extends VariantEval with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("eval", "VCF", vcf) - if ( dbSNP.exists() ) - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.doNotUseAllStandardStratifications = true - this.doNotUseAllStandardModules = true - this.evalModule = List("CountVariants", "IndelStatistics", "CompOverlap") - this.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "JexlExpression") - this.out = swapExt(vcf,".vcf",".eval") - } - - def productionString(gsaProduction: Boolean): String = { - return if ( gsaProduction ) ".prod" else ".expt" - } - - class Call(@Input(doc="foo") bam: File, gsaProduction: Boolean) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { - @Output(doc="foo") var outVCF: File = swapExt(bam,".bam", productionString(gsaProduction) + ".indels.vcf") - this.input_file = List(bam) - this.stand_call_conf = 50.0 - this.stand_emit_conf = 50.0 - this.dcov = DCOV; - this.o = outVCF - - this.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.INDEL - this.GSA_PRODUCTION_ONLY = gsaProduction - - if ( dbSNP.exists() ) - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - } -} - diff --git a/scala/qscript/oneoffs/depristo/PrepareBamsForHomogeneityTesting.scala 
b/scala/qscript/oneoffs/depristo/PrepareBamsForHomogeneityTesting.scala deleted file mode 100755 index e5bdb16eb..000000000 --- a/scala/qscript/oneoffs/depristo/PrepareBamsForHomogeneityTesting.scala +++ /dev/null @@ -1,81 +0,0 @@ -package oneoffs.depristo - -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation -import scala.io.Source._ -import org.broadinstitute.sting.queue.function.JavaCommandLineFunction - - -class PrepareBamsForHomogeneityTesting extends QScript { - qscript => - @Argument(doc="Path to GATK jar",required=false,shortName="gatkjarfile") var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar") - @Argument(doc="Path to SamToFastq jar",required=false,shortName="SamToFastqjarfile") var SamToFastqJar: File = new File("/seq/software/picard/current/bin/SamToFastq.jar") - @Argument(doc="BAM File",required=true, shortName="bam") var bams: File = null - @Argument(doc="Bundle path",required=false, shortName="bundle") var bundle: File = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/") - -// @Argument(shortName = "R", doc="ref", required=true) -// var referenceFile: File = bundle + "/human_g1k_v37.fasta" - - @Argument(shortName = "L", doc="intervals", required=false) - val TARGET_INTERVAL: String = null; - - trait GATKArgs extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - if ( TARGET_INTERVAL != null ) - this.intervalsString = List(TARGET_INTERVAL); -// this.reference_sequence = referenceFile; - this.memoryLimit = 2 - } - -// class ClipBAM(@Input in: File, @Output out: File) extends ClipReadsWalker with GATKArgs { -// this.o = out -// this.CT = "1-25" -// this.CR = ClippingRepresentation.HARDCLIP_BASES -// this.OQ = true -// } -// -// case class revert (@Input inBam: File, @Output outBam: File) extends PicardBamFunction { -// @Output(doc="reverted bam index") var revertedBamIndex = new 
File(outBam + ".bai") -// override def inputBams = List(inBAM) -// override def outputBam = outBam -// override def commandLine = super.commandLine + " CREATE_INDEX=true " -// this.isIntermediate = true -// this.jarFile = qscript.dedupJar -// this.analysisName = queueLogDir + outBam + ".dedup" -// this.jobName = queueLogDir + outBam + ".dedup" -// } - - case class SamToFastq (@Input inBam: File, @Output fastq1: File, @Output fastq2: File, trim: Int) extends JavaCommandLineFunction { - this.jarFile = qscript.SamToFastqJar - override def commandLine = super.commandLine + - Array( - " INPUT=" + inBam, - " FASTQ=" + fastq1, - " VALIDATION_STRINGENCY=SILENT", - " SECOND_END_FASTQ=" + fastq2, - " READ1_TRIM=" + trim, - " READ2_TRIM=" + trim).mkString - - //this.analysisName = queueLogDir + outBam + ".dedup" - //this.jobName = queueLogDir + outBam + ".dedup" - } - - def createListFromFile(in: File):List[File] = { - if (in.toString.endsWith("bam")) - return List(in) - var l: List[File] = List() - for (bam <- fromFile(in).getLines) - l :+= new File(bam) - return l - } - - def script = { - for ( bam <- createListFromFile(bams) ) { - val fastq1 = swapExt(bam, ".bam", ".1.fastq") - val fastq2 = swapExt(bam, ".bam", ".2.fastq") - add(new SamToFastq(bam, fastq1, fastq2, 25)) - } - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala b/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala deleted file mode 100755 index bf0231f4d..000000000 --- a/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala +++ /dev/null @@ -1,146 +0,0 @@ -package oneoffs.depristo - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.function.JavaCommandLineFunction - -class ReducedBAMEvaluation extends QScript { - @Argument(doc="gatkJarFile", required=false) - var gatkJarFile: 
File = new File("/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar") - - @Argument(shortName = "R", doc="ref", required=false) - var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - @Argument(shortName = "bam", doc="BAM", required=true) - val bam: File = null; - - @Argument(shortName = "reduceIntervals", doc="intervals", required=false) - val REDUCE_INTERVAL: String = null; - - @Argument(shortName = "callingIntervals", doc="intervals", required=false) - val CALLING_INTERVAL: String = null; - - @Argument(shortName = "dcov", doc="dcov", required=false) - val DCOV: Int = 250; - - @Argument(shortName = "minimalVCF", doc="", required=false) - val minimalVCF: Boolean = false; - - val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_132.b37.vcf") - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - this.reference_sequence = referenceFile; - this.memoryLimit = 4 - } - - trait CoFoJa extends JavaCommandLineFunction { - override def javaOpts = super.javaOpts // + " -javaagent:lib/cofoja.jar" - } - - def script = { - val reducedBAM = new ReduceBAM(bam) - add(reducedBAM) - val reducedBAMVCF = callAndEvaluateBAM(reducedBAM.out) - - val slicedBAM = new SliceBAM(bam) - add(slicedBAM) - val fullBAMVCF = callAndEvaluateBAM(slicedBAM.out) - - val combineCalls = new CombineVariants with UNIVERSAL_GATK_ARGS - combineCalls.rodBind :+= RodBind("fullBAM", "VCF", fullBAMVCF) - combineCalls.rodBind :+= RodBind("reducedBAM", "VCF", reducedBAMVCF) - combineCalls.rod_priority_list = "reducedBAM,fullBAM" - combineCalls.filteredrecordsmergetype = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED - combineCalls.out = swapExt(reducedBAM.out,".bam",".filtered.combined.vcf") - add(combineCalls) - val eval = new Eval(combineCalls.out) // evaluate the combined VCF - eval.select = 
List("'set==\"Intersection\"'", "'set==\"fullBAM\"'", "'set==\"reducedBAM\"'", "'set==\"filterInreducedBAM-fullBAM\"'", "'set==\"reducedBAM-filterInfullBAM\"'") - eval.selectName = List("Intersection", "fullBAM", "reducedBAM", "filterInreducedBAM-fullBAM", "reducedBAM-filterInfullBAM") - add(eval) - } - - def callAndEvaluateBAM(bam: File): File = { - val rawVCF = new Call(bam) - add(rawVCF) - - val filterSNPs = new VariantFiltration with UNIVERSAL_GATK_ARGS - filterSNPs.variantVCF = rawVCF.out - filterSNPs.filterName = List("SNP_SB", "SNP_QD", "SNP_HRun") - filterSNPs.filterExpression = List("\"SB>=0.10\"", "\"QD<5.0\"", "\"HRun>=4\"") - filterSNPs.clusterWindowSize = 10 - filterSNPs.clusterSize = 3 - filterSNPs.out = swapExt(rawVCF.out,".vcf",".filtered.vcf") - add(filterSNPs) - - // create a variant eval for us - add(new Eval(filterSNPs.out)) - - // for convenient diffing - add(new DiffableTable(rawVCF.out)) - add(new DiffableTable(filterSNPs.out)) - - return filterSNPs.out - } - - class Eval(@Input vcf: File) extends VariantEval with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("eval", "VCF", vcf) - if ( dbSNP.exists() ) - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.doNotUseAllStandardStratifications = true - this.doNotUseAllStandardModules = true - this.evalModule = List("TiTvVariantEvaluator", "CountVariants") - this.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "JexlExpression") - this.out = swapExt(vcf,".vcf",".eval") - if ( CALLING_INTERVAL != null ) - this.intervalsString = List(CALLING_INTERVAL); - } - - - class ReduceBAM(bam: File) extends ReduceReads with UNIVERSAL_GATK_ARGS with CoFoJa { - this.memoryLimit = 3 - this.input_file = List(bam) - this.o = swapExt(bam,".bam",".reduced.bam") - this.CS = 20 - this.mravs = 50 - this.mbrc = 10000 - - if ( REDUCE_INTERVAL != null ) - this.intervalsString = List(REDUCE_INTERVAL); - } - - class SliceBAM(bam: File) extends PrintReads with UNIVERSAL_GATK_ARGS { - 
this.memoryLimit = 3 - this.input_file = List(bam) - this.o = swapExt(bam,".bam",".printreads.bam") - if ( REDUCE_INTERVAL != null ) - this.intervalsString = List(REDUCE_INTERVAL); - } - - class Call(@Input(doc="foo") bam: File) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { - @Output(doc="foo") var outVCF: File = swapExt(bam,".bam",".vcf") - this.input_file = List(bam) - this.stand_call_conf = 50.0 - this.stand_emit_conf = 50.0 - this.dcov = DCOV; - this.o = outVCF - - if ( dbSNP.exists() ) - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - - if ( minimalVCF ) - this.group = List("none") - - if ( CALLING_INTERVAL != null ) { - this.intervalsString = List(CALLING_INTERVAL) - } - } - - class DiffableTable(@Input vcf: File) extends CommandLineFunction { - @Output var out: File = swapExt(vcf,".vcf",".table") - def commandLine = "cut -f 1,2,4,5,7 %s > %s".format(vcf, out) - } -} - diff --git a/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q b/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q deleted file mode 100755 index 73e57b7f0..000000000 --- a/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q +++ /dev/null @@ -1,237 +0,0 @@ -package oneoffs.depristo - -//import net.sf.picard.reference.FastaSequenceFile -//import org.broadinstitute.sting.datasources.pipeline.Pipeline -//import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -//import collection.JavaConversions._ - -class RefineGenotypesWithBeagle extends QScript { - qscript => - - @Argument(doc="VCF file to run beagle genotype refinement on",required=true,shortName="vcf") var vcfsToBeagle: List[File] = _ - @Argument(doc="Path to GATK jar",required=true,shortName="gatkjarfile") var gatkJarFile: File = _ - @Argument(doc="Path to BEAGLE jar",required=true,shortName="beagle") var beagleJar: File = _ - @Argument(doc="Reference file",required=true,shortName="R") var reference: File = _ - 
@Argument(doc="Beagle interval",required=false,shortName="L") var interval: String = null - @Argument(doc="Evaluation interval",required=false,shortName="Le") var EvalInterval: String = null - @Argument(doc="Memory in GB for beagle",required=false,shortName="BM") var BEAGLE_MEM_IN_GB: Int = 12 - @Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve") - - @Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false - @Argument(doc="If provided, we'll skip over creating the reference panels, even if apparently required",required=false,shortName="assumeReferencePanelsExist") var assumeReferencePanelsExist: Boolean = false - @Argument(doc="Tmpdir",required=false,shortName="tmpdir") var TMPDIR: File = new File("./") - - // assessing imputation performance - @Argument(doc="VCF sites and alleles for genotyping assessment",required=false,shortName="assessmentSites") var assessmentSites: File = null - @Argument(doc="BAM File for genotyping",required=false, shortName="bam") var bam: File = null - @Argument(doc="Percent of sites that should be left out of BAM VCF to assess imputation", required=false, shortName="flo") - var fractionsLeftOut: List[Double] = List(0.1, 0.2, 0.5, 0.9) - // todo -- this might be best to think about in a different unit -- marker density per bp - - - val MISSING_KEY = "?" 
- val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf") - val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf") - val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf" - - trait GATKArgs extends CommandLineGATK { - this.reference_sequence = qscript.reference - this.jarFile = qscript.gatkJarFile - this.memoryLimit = 2 - } - - // -------------------------------------------------------------------------------- - // - // GENOTYPING SPECIFIC SITES IN A BAM FILE - // - // -------------------------------------------------------------------------------- - - class GenotypeBAMAtSites(@Input bam: File, @Input sitesVCF: File, @Output genotypesVCF: File) extends UnifiedGenotyper with GATKArgs { - this.input_file = List(bam) - this.o = genotypesVCF - this.stand_call_conf = 0.0 - this.out_mode = org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES - this.gt_mode = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - this.rodBind :+= new RodBind("alleles","VCF",sitesVCF) - - // we only want chromosome counts annotations - this.BTI = "alleles" - this.G = List("none") - this.A :+= "ChromosomeCounts" - this.nsl = true - - // make sure we have the right intervals - if ( interval != null ) { - this.intervalsString = List(interval) - this.BTIMR = org.broadinstitute.sting.utils.interval.IntervalSetRule.INTERSECTION - } - } - - // -------------------------------------------------------------------------------- - // - // BEAGLE COMMANDS - // - // -------------------------------------------------------------------------------- - - class BeagleCommand(outputBase: String) extends CommandLineFunction { - this.memoryLimit = BEAGLE_MEM_IN_GB - - // Note: These get set - @Output val 
beaglePhasedFile: File = new File(outputBase +".phased.gz") - @Output val beagleLikelihoods: File = new File(outputBase +".gprobs.gz") - @Output val beagleRSquared: File = new File(outputBase +".r2") - - def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s out=ignore.me omitprefix=true".format(TMPDIR, BEAGLE_MEM_IN_GB, beagleJar) - } - - class RefineGenotypesWithBeagle(@Input beagleInput: File, moreBeagleArgs: String = "") - extends BeagleCommand(beagleInput.getName) { - def myArgs = " like=%s %s".format(beagleInput.getAbsolutePath, moreBeagleArgs) - override def commandLine = super.commandLine + myArgs - } - - class ImputeMissingGenotypesWithReferencePanel(@Input evalBeagle: File, - @Input phasedBeagleFile: File, - @Input markers: File, - moreBeagleArgs: String = "") - extends BeagleCommand(evalBeagle.getName) { - def myArgs = " unphased=%s phased=%s markers=%s %s".format(evalBeagle.getAbsolutePath, - phasedBeagleFile.getAbsolutePath, markers.getAbsolutePath, moreBeagleArgs) - override def commandLine = super.commandLine + myArgs - } - - class GunzipFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { - def commandLine = "gunzip -c %s > %s".format(in.getAbsolutePath, out.getAbsolutePath) - } - - // -------------------------------------------------------------------------------- - // - // CREATING AND EVALUATING REFERENCE PANELS - // - // -------------------------------------------------------------------------------- - - class ReferencePanelBuilder(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") { - val beagleInput = new ProduceBeagleInput with GATKArgs - if ( interval != null ) beagleInput.intervalsString = List(interval) - beagleInput.variantVCF = inputVCF - beagleInput.out = swapExt(outputVCF,".vcf",".beagle") - if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE - beagleInput.markers = swapExt(outputVCF, ".vcf", ".markers.txt") - - val refine = new 
RefineGenotypesWithBeagle(beagleInput.out, moreBeagleArgs) - - val unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl")) - val unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl")) - - val vcfConvert = new BeagleOutputToVCF with GATKArgs - vcfConvert.variantVCF = inputVCF - vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared) - vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.out) - vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.out) - vcfConvert.out = outputVCF - - def getMarkers = beagleInput.markers - def getPanelPhasedHaplotypes = refine.beaglePhasedFile - def getPanelVCF = vcfConvert.out - - def enqueueCommands() = { - for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) ) - add(cmd) - } - } - - class EvaluateReferencePanel(@Input evalVCF: File, - @Output outputVCF: File, - panel: ReferencePanelBuilder, - percentLeftOut: Double, - moreBeagleArgs: String = "") { - val evalBeagle = new VariantsToBeagleUnphased with GATKArgs - if ( interval != null ) evalBeagle.intervalsString = List(interval) - evalBeagle.variantVCF = evalVCF - evalBeagle.out = swapExt(outputVCF,".vcf",".unphased.beagle") - evalBeagle.bs = percentLeftOut - evalBeagle.bsvcf = swapExt(outputVCF,".vcf",".missing.vcf") - evalBeagle.missing = MISSING_KEY - //evalBeagle.isIntermediate = true - - val refine = new ImputeMissingGenotypesWithReferencePanel(evalBeagle.out, panel.getPanelPhasedHaplotypes, panel.getMarkers, moreBeagleArgs) - - val unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl")) - val unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl")) - //unzipPhased.isIntermediate = true - //unzipProbs.isIntermediate = true - - val vcfConvert = new BeagleOutputToVCF with GATKArgs - vcfConvert.variantVCF = 
evalVCF - vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared) - vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.out) - vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.out) - vcfConvert.out = outputVCF - vcfConvert.keep_monomorphic = true - - def getBootstrap: File = evalBeagle.bsvcf - - def enqueueCommands() = { - for ( cmd: CommandLineFunction <- List(evalBeagle, refine, unzipPhased, unzipProbs, vcfConvert) ) - add(cmd) - } - } - - class EvalPanelAtChipSites(@Input eval: File) extends VariantEval with GATKArgs { - this.noST = true - this.evalModule :+= "GenotypeConcordance" - this.o = swapExt(eval, ".vcf", ".vcf.eval") - this.rodBind :+= RodBind("eval", "VCF", eval) - this.rodBind :+= RodBind("comp_hm3", "VCF", HM3_VCF) - this.rodBind :+= RodBind("comp_omni", "VCF", OMNI_VCF) - if ( EvalInterval != null ) this.intervalsString = List(EvalInterval) - } - - class EvalPanelAtBAMCalledSites(@Input imputedVCF: File, @Input bamGenotypes: File, @Input bootstrap: File) extends VariantEval with GATKArgs { - this.evalModule :+= "GenotypeConcordance" - this.o = swapExt(imputedVCF, ".vcf", ".vcf.eval") - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP_b37) - this.rodBind :+= RodBind("eval", "VCF", imputedVCF) - this.rodBind :+= RodBind("comp_bam_genotypes", "VCF", bamGenotypes) - this.rodBind :+= RodBind("comp_bootstrap", "VCF", bootstrap) - if ( EvalInterval != null ) this.intervalsString = List(EvalInterval) - } - - def script = { - var bamGenotypes: File = null - - if ( bam != null ) { - bamGenotypes = new File(swapExt(bam, ".bam", "_genotyped_at." + assessmentSites.getName).getName) - add(new GenotypeBAMAtSites(bam, assessmentSites, bamGenotypes)) - } - - for ( vcf <- vcfsToBeagle ) { - if ( ! TEST ) add(new EvalPanelAtChipSites(vcf)) - for ( useCalibrationCurve <- List(true, false) ) { - for ( niter <- List(10, 20, 50) ) { - if ( ! TEST || (niter == 10 && ! 
useCalibrationCurve)) { - val refineFilenamePart = "niter_%d_cc_%b".format(niter, useCalibrationCurve) - val refine_out = swapExt(vcf, ".vcf", ".refined.%s.vcf".format(refineFilenamePart)) - val refPanel = new ReferencePanelBuilder(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter)) - if ( ! assumeReferencePanelsExist ) refPanel.enqueueCommands() - - // start up VE - add(new EvalPanelAtChipSites(refine_out)) - - if ( bamGenotypes != null ) { - for ( fractionLeftOut <- if ( TEST ) List(0.1) else fractionsLeftOut ) { - val bamGenotypesImputed = swapExt(bamGenotypes, ".vcf", "_flo_%.2f.imputed_with_%s".format(fractionLeftOut, refine_out)) - val args = "niterations=%d missing=\"%s\"".format(niter, MISSING_KEY) - val panelEval = new EvaluateReferencePanel(bamGenotypes, bamGenotypesImputed, refPanel, fractionLeftOut, args) - panelEval.enqueueCommands() - add(new EvalPanelAtBAMCalledSites(bamGenotypesImputed, bamGenotypes, panelEval.getBootstrap)) - } - } - } - } - } - } - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/depristo/VQSRCutByNRS.scala b/scala/qscript/oneoffs/depristo/VQSRCutByNRS.scala deleted file mode 100755 index 7d5d961f1..000000000 --- a/scala/qscript/oneoffs/depristo/VQSRCutByNRS.scala +++ /dev/null @@ -1,107 +0,0 @@ -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.queue.QScript -import org.apache.commons.io.FilenameUtils; -import scala.io.Source._ - -class VQSRCutByNRS extends QScript { - // @Input(doc="bamIn", shortName="I", required=true) - // var bamList: File = _ - - @Argument(doc="gatk jar file") - var gatkJarFile: File = _ - - // @Argument(shortName = "R", doc="ref") - // var referenceFile: File = _ - - @Argument(fullName = "prefix", doc="Prefix argument", required=false) - var prefix: String = "" - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = 
gatkJarFile; memoryLimit = 3 } - -class Target(val name: String, val reference: File, val rodName: String, val VCF: File, val intervals: Option[String], val titvTarget: Double) { - def clusterFile = new File(name + ".clusters") -} - -val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") -val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta") -val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.fasta") - -val HiSeq = new Target("NA12878.HiSeq", hg18, "hg18", new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), None, 2.07) -val WEx = new Target("NA12878.WEx", hg18, "hg18", new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), Some("~/localData/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list"), 2.6) -val LowPassN60 = new Target("lowpass.N60", b36, "b36", new File("lowpass.N60.chr20.filtered.vcf"), Some("20"), 2.3) -val LowPassAugust = new Target("ALL.august.v3", hg19, "b37", new File("ALL.august.v3.chr20.filtered.vcf"), Some("20"), 2.3) -val TGPWExFH = new Target("1000G.WEx.FH", hg19, "b37", new File("/humgen/gsa-pipeline/PQ7LC/all_batches_v006/Plate_1/SnpCalls/Barcoded_1000G_WEx_Plate_1.cleaned.annotated.handfiltered.vcf"), Some("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"), 3.0) -val TGPWExGdA = new Target("1000G.WEx.GdA", hg19, "b37", new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), Some("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"), 3.0) - -val targets = 
List(HiSeq, WEx, LowPassN60, LowPassAugust, TGPWExFH, TGPWExGdA) - -val allTailedAnnotations = List("QD+", "SB-", "HaplotypeScore-", "HRun-") -val someTailedAnnotations = List("QD", "SB-", "HaplotypeScore-", "HRun") -val twoTailedAnnotations = List("QD", "SB", "HaplotypeScore", "HRun") - -def script = { - for (target <- targets) { - add(new GenerateVariantClusters(target) ) - add(new VariantRecalibratorTiTv(target, someTailedAnnotations, ".sb.hs.tailed") ) - add(new VariantRecalibratorNRS(target, someTailedAnnotations, ".sb.hs.tailed") ) - add(new VariantRecalibratorTiTv(target, allTailedAnnotations, ".all.tailed") ) - add(new VariantRecalibratorNRS(target, allTailedAnnotations, ".all.tailed") ) - add(new VariantRecalibratorTiTv(target, twoTailedAnnotations, ".untailed") ) - add(new VariantRecalibratorNRS(target, twoTailedAnnotations, ".untailed") ) - - } -} - -def bai(bam: File) = new File(bam + ".bai") - -val FiltersToIgnore = List("DPFilter", "ABFilter", "ESPStandard", "QualByDepth", "StrandBias", "HomopolymerRun") - -class GenerateVariantClusters(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.GenerateVariantClusters with UNIVERSAL_GATK_ARGS { - this.reference_sequence = t.reference - this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_" + t.rodName + ".rod") - this.rodBind :+= RodBind("hapmap", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." 
+ t.rodName + "_fwd.vcf") - this.rodBind :+= RodBind("input", "VCF", t.VCF) - this.clusterFile = t.clusterFile - this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") - this.analysisName = t.name + "_Cluster" - if ( t.intervals != None ) this.intervalsString ++= List(t.intervals.get) - this.qual = 300 - this.std = 3.5 - this.mG = 16 // v2 calls - // ignores - this.ignoreFilter ++= FiltersToIgnore -} - - -class VariantRecalibratorBase(t: Target, ans: List[String]) extends org.broadinstitute.sting.queue.extensions.gatk.VariantRecalibrator with UNIVERSAL_GATK_ARGS { - this.reference_sequence = t.reference - this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_" + t.rodName + ".rod") - this.rodBind :+= RodBind("hapmap", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf") - this.rodBind :+= RodBind("truth", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf") - this.rodBind :+= RodBind("input", "VCF", t.VCF) - this.clusterFile = t.clusterFile - this.analysisName = t.name+"_VR" - if ( t.intervals != None ) this.intervalsString ++= List(t.intervals.get) - this.ignoreFilter ++= FiltersToIgnore - this.ignoreFilter ++= List("HARD_TO_VALIDATE") - this.priorDBSNP = 2.0 - this.priorHapMap = 2.0 - this.target_titv = t.titvTarget - this.use_annotation ++= ans - this.out = new File("/dev/null") -} - -class VariantRecalibratorTiTv(t: Target, ans: List[String], prefix: String) extends VariantRecalibratorBase(t, ans) { - this.tranche ++= List("0.1", "1.0", "10.0", "100.0") - //this.out = new File(t.name + ".titv.recalibrated.vcf") - this.tranchesFile = new File(t.name + prefix + ".titv.tranches") -} - -class VariantRecalibratorNRS(t: Target, ans: List[String], prefix: String) extends VariantRecalibratorBase(t,ans) { - this.sm = 
org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY - this.tranche ++= List("50", "25", "10", "5", "2", "1", "0.5", "0.1") - //this.out = new File(t.name + ".ts.recalibrated.vcf") - this.tranchesFile = new File(t.name + prefix + ".ts.tranches") -} -} diff --git a/scala/qscript/oneoffs/depristo/manySampleUGPerformance.scala b/scala/qscript/oneoffs/depristo/manySampleUGPerformance.scala deleted file mode 100755 index a85928dd2..000000000 --- a/scala/qscript/oneoffs/depristo/manySampleUGPerformance.scala +++ /dev/null @@ -1,104 +0,0 @@ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.queue.extensions.gatk._ - -class ManySampleUGPerformanceTesting extends QScript { - @Argument(doc="gatkJarFile", required=false) - var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar") - - @Argument(shortName = "R", doc="ref", required=false) - var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - @Argument(shortName = "bams", doc="BAMs", required=true) - val FULL_BAM_LIST: File = null; - - @Argument(shortName = "intervals", doc="intervals", required=true) - val TARGET_INTERVAL: String = null; - - @Argument(shortName = "preMerge", doc="preMerge", required=false) - val PRE_MERGE: Boolean = false; - - @Argument(shortName = "dcov", doc="dcov", required=false) - val DCOV: Int = 50; - - @Argument(shortName = "exome", doc="exome ",required=false) - val EXOME_NSAMPLES: Boolean = false; - - val MERGED_DIR = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/manySampleUGPerformance/") - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - this.intervals = List(new File(TARGET_INTERVAL)); - this.reference_sequence = referenceFile; - //this.jobQueue = "gsa"; - 
this.memoryLimit = 4 - //this.commandDirectory = new File("results"); - } - - def script = { - for (nSamples <- if ( EXOME_NSAMPLES) List(1, 2, 5, 10, 25, 50, 100, 200, 300, 400, 500) else List(1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900) ) { -// for (nSamples <- List(10)) { - val sublist = new SliceList(nSamples) - val mergeSublist = new MergeBAMs(sublist.list) - - val name: String = if ( PRE_MERGE ) "pre_merge" else "dynamic_merge" - val bams: File = if ( PRE_MERGE ) mergeSublist.o else sublist.list - - add(sublist) - if ( PRE_MERGE ) { - add(mergeSublist) - add(new Index(mergeSublist.o) ) - } - - // SNP calling - //add(new Call(sublist.list, nSamples, "dynamic_merge")) - add(new Call(bams, nSamples, name)); - - val gtWithBAQ = new Call(bams, nSamples, name + "_baq"); - gtWithBAQ.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE - add(gtWithBAQ) - - // SNP calling -- no annotations - //add(new Call(bams.list, nSamples, "dynamic_merge_no_annotations") { this.G :+= "None"; }) - - // CountLoci - //add(new MyCountLoci(sublist.list, nSamples, "dynamic_merge")) - add(new MyCountLoci(bams, nSamples, name)) - } - } - - class Index(bamIn: File) extends SamtoolsIndexFunction { - //this.jobQueue = "gsa" - bamFile = bamIn - } - - class MergeBAMs(bamList: File) extends PrintReads with UNIVERSAL_GATK_ARGS { - this.memoryLimit = 3 - this.input_file :+= bamList - this.memoryLimit = 16 - this.o = new File(MERGED_DIR + "/" + bamList.getName + ".bam") - } - - class Call(@Input(doc="foo") bamList: File, n: Int, name: String) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { - @Output(doc="foo") var outVCF: File = new File("%s.%d.%s.vcf".format(bamList.getName, n, name)) - this.input_file :+= bamList - this.stand_call_conf = 10.0 - this.dcov = DCOV; - this.o = outVCF - } - - class MyCountLoci(@Input(doc="foo") bamList: File, n: Int, name: String) extends CountLoci with UNIVERSAL_GATK_ARGS { - @Output(doc="foo") var outFile: File = new 
File("%s.%d.%s.txt".format(bamList.getName, n, name)) - this.input_file :+= bamList - this.dcov = DCOV; - this.o = outFile - } - - class SliceList(n: Int) extends CommandLineFunction { - @Output(doc="foo") var list: File = new File("bams.%d.list".format(n)) - def commandLine = "head -n %d %s > %s".format(n, FULL_BAM_LIST, list) - //this.jobQueue = "gsa"; - } -} - diff --git a/scala/qscript/oneoffs/depristo/resequencingSamples1KG.scala b/scala/qscript/oneoffs/depristo/resequencingSamples1KG.scala deleted file mode 100644 index b17ccaafc..000000000 --- a/scala/qscript/oneoffs/depristo/resequencingSamples1KG.scala +++ /dev/null @@ -1,41 +0,0 @@ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ - -class resequencingSamples1KG extends QScript { - @Argument(doc="gatkJarFile", required=false) - var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar") - - @Argument(shortName = "R", doc="ref", required=false) - var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - val TARGET_INTERVAL = "my.intervals" - val TEST_BAM_LIST = new File("ten.bam.list") - val FULL_BAM_LIST = new File("/humgen/1kg/processing/allPopulations_chr20_june_release/allPopulations.june.bam.list") - val BAM_LIST = FULL_BAM_LIST - val HM3 = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr.hg19_fwd.vcf") - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - this.intervals = List(new File(TARGET_INTERVAL)); - this.reference_sequence = referenceFile; - this.jobQueue = "gsa"; - this.et = org.broadinstitute.sting.gatk.phonehome.GATKRunReport.PhoneHomeOption.STANDARD; - this.dcov = 50; - } - - def script = { - // SNP calling - add(new MyQSample(BAM_LIST)); - } - - class MyQSample(@Input(doc="foo") bamList: File) extends QSample with UNIVERSAL_GATK_ARGS { - 
this.memoryLimit = 4 - this.input_file :+= bamList - //this.BTI = "genotypes" - this.nt = 10 - this.rodBind :+= RodBind("genotypes", "VCF", HM3) - this.o = new File("%s.qsample".format(bamList.getName)) - } -} - diff --git a/scala/qscript/oneoffs/fromer/CNV/ReadDepthCNVanalysis.scala b/scala/qscript/oneoffs/fromer/CNV/ReadDepthCNVanalysis.scala deleted file mode 100644 index 779b05135..000000000 --- a/scala/qscript/oneoffs/fromer/CNV/ReadDepthCNVanalysis.scala +++ /dev/null @@ -1,144 +0,0 @@ -package oneoffs.fromer.CNV - -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.util.VCF_BAM_utilities -import java.io.PrintWriter -import org.apache.commons.io.IOUtils - -class ReadDepthCNVanalysis extends QScript { - qscript => - - @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) - var bams: File = _ - - @Argument(doc = "gatk jar file", shortName = "J", required = true) - var gatkJarFile: File = _ - - @Argument(shortName = "R", doc = "ref", required = true) - var referenceFile: File = _ - - @Argument(shortName = "L", doc = "Intervals", required = false) - var intervals: String = _ - - @Input(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) - var scatterCountInput = 0 - - @Input(doc = "Samples to phase together. 
By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) - var samplesPerJob = 1 - - @Output(doc = "DoC file to output", shortName = "o", required = true) - var outputDoC: File = _ - - @Input(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) - var MAX_DEPTH = 20000 - - @Input(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) - var NUM_BINS = 200 - - @Input(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) - var START_BIN = 1 - - @Input(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) - var minMappingQuality = 0 - - val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary" - - val DOC_MEAN_COVERAGE_OUTPUT: String = ".sample_interval.averageCoverage.txt" - - trait CommandLineGATKArgs extends CommandLineGATK { - this.intervalsString = List(qscript.intervals) - this.jarFile = qscript.gatkJarFile - this.reference_sequence = qscript.referenceFile - //this.memoryLimit = 3 - this.logging_level = "INFO" - } - - // A target has a list of samples and bam files to use for DoC - class Target(val name: String, val samples: List[String], val bams: List[File]) { - var prefix: String = outputDoC.getParent() - if (prefix == null) - prefix = "" - else - prefix = prefix + "/" - - def DoC_output = new File(prefix + name + "." 
+ outputDoC.getName()) - - override def toString(): String = String.format("[Target %s [%s] with samples %s against bams %s]", name, DoC_output, samples, bams) - } - - def script = { - val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams)) - val samples: List[String] = sampleToBams.keys.toList - Console.out.printf("Samples are %s%n", samples) - - val targets: List[Target] = buildTargets(samples, sampleToBams) - - for (target <- targets) { - Console.out.printf("Target is %s%n", target) - add(new DoC(target)) - } - - add(new combineDoC(targets.map(u => new File(u.DoC_output.getPath() + DOC_OUTPUT_SUFFIX)))) - } - - def buildTargets(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[Target] = { - - def buildTargetsHelper(samples: List[String], count: Int): List[Target] = (samples splitAt samplesPerJob) match { - case (Nil, y) => - return Nil - case (subsamples, remaining) => - return new Target("group" + count, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) :: - buildTargetsHelper(remaining, count + 1) - } - - return buildTargetsHelper(samples, 0) - } - - class DoC(t: Target) extends CommandLineGATKArgs with ScatterGatherableFunction { - this.analysis_type = "DepthOfCoverage" - - this.input_file = t.bams - - this.downsample_to_coverage = MAX_DEPTH - this.downsampling_type = DownsampleType.BY_SAMPLE - - this.scatterCount = scatterCountInput - this.scatterClass = classOf[IntervalScatterFunction] - - @Output - @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) - var intervalSampleOut: File = new File(t.DoC_output.getPath() + DOC_OUTPUT_SUFFIX) - - val outFile = new File(intervalSampleOut.getParentFile(), t.DoC_output.getName()) - - override def commandLine = super.commandLine + - " --omitDepthOutputAtEachBase 
--omitLocusTable --minBaseQuality 0 --minMappingQuality " + minMappingQuality + - " --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS + - " -o " + outFile - - override def dotString = "DOC: " + t.DoC_output - - this.jobOutputFile = outFile + ".out" - } - - class combineDoC(DoCsToCombine: List[File]) extends CommandLineFunction { - override def description = "Combines DoC outputs for multiple samples (at same loci)" - - @Input(doc = "") - var inputDoCfiles: List[File] = DoCsToCombine - - @Output - val outputDoCaverageCoverage: File = new File(outputDoC.getPath + DOC_MEAN_COVERAGE_OUTPUT) - - var command: String = "~fromer/CNV/wave1+2/scripts/mergeDoC.pl -gatk " + qscript.gatkJarFile.getPath.replaceFirst("dist/GenomeAnalysisTK.jar", "") + " -ref " + qscript.referenceFile + " -out " + outputDoCaverageCoverage - for (input <- inputDoCfiles) { - command += " " + input - } - def commandLine = command - - // Since loading ALL of the output into the perl script can take significant memory: - this.memoryLimit = 9 - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/fromer/PhaseSamples.scala b/scala/qscript/oneoffs/fromer/PhaseSamples.scala deleted file mode 100644 index 1b55d1dd0..000000000 --- a/scala/qscript/oneoffs/fromer/PhaseSamples.scala +++ /dev/null @@ -1,120 +0,0 @@ -package oneoffs.fromer - -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.util.VCF_BAM_utilities - -class PhaseSamples extends QScript { - qscript => - - @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) - var bams: File = _ - - @Input(doc = "master VCF calls", shortName = "C", required = true) - var masterCalls: File = _ - - @Argument(doc = "gatk jar file", shortName = "J", required = true) - var gatkJarFile: File = _ - - @Argument(shortName = "R", doc = "ref", required = true) - var referenceFile: File = _ - - @Argument(fullName 
= "prefix", doc = "Prefix argument", required = false) - var prefix: String = "" - - @Argument(shortName = "L", doc = "Intervals", required = false) - var intervals: String = null - - @Input(doc = "level of parallelism for BAM phaser. By default is set to 0 [no scattering].", shortName = "scatter", required = false) - var scatterCount = 0 - - @Input(doc = "Samples to phase together. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) - var samplesPerJob = 1 - - @Output(doc = "Phased file to output", shortName = "o", required = true) - var outputPhased: File = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - if (qscript.intervals != null) { - this.intervalsString = List(qscript.intervals) - } - this.jarFile = qscript.gatkJarFile - this.reference_sequence = qscript.referenceFile - this.memoryLimit = 3 - this.logging_level = "INFO" - } - - // A target has a list of samples and bam files to use for phasing - class Target(val name: String, val samples: List[String], val bams: List[File]) { - var prefix: String = outputPhased.getParent() - if (prefix == null) - prefix = "" - else - prefix = prefix + "/" - - def phasedVCFFile = new File(prefix + name + "." 
+ outputPhased.getName()) - - override def toString(): String = String.format("[Target %s [%s] with samples %s against bams %s]", name, phasedVCFFile, samples, bams) - } - - def script = { - if (qscript.scatterCount > 0) throw new RuntimeException("scatter/gather currently not implemented") - - val samples: List[String] = VCF_BAM_utilities.getSamplesFromVCF(masterCalls) - Console.out.printf("Samples are %s%n", samples) - - val targets: List[Target] = bamsToTargets(samples, bams) - - for (target <- targets) { - Console.out.printf("Target is %s%n", target) - add(new PhaseSamples(target)) - } - - add(new CombineVariants(targets.map(_.phasedVCFFile))) - - add(new PhasingByACeval()) - } - - def bamsToTargets(samples: List[String], bamsIn: File): List[Target] = { - val bams: List[File] = VCF_BAM_utilities.parseBAMsInput(bamsIn) - val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(bams) - - def buildTargets(samples: List[String], count: Int): List[Target] = (samples splitAt samplesPerJob) match { - case (Nil, y) => - return Nil - case (subsamples, remaining) => - return new Target("group" + count, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) :: - buildTargets(remaining, count + 1) - } - - return buildTargets(samples, 0) - } - - class PhaseSamples(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.ReadBackedPhasing with CommandLineGATKArgs { - this.rodBind :+= RodBind("variant", "VCF", qscript.masterCalls) - this.out = t.phasedVCFFile - this.input_file = t.bams - this.sampleToPhase = t.samples - } - - class CombineVariants(vcfsToCombine: List[File]) extends org.broadinstitute.sting.queue.extensions.gatk.CombineVariants with CommandLineGATKArgs { - for (vcf <- vcfsToCombine) { - this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) - } - - // add the master call: - this.rodBind :+= RodBind("master", "VCF", masterCalls) - this.variantMergeOptions = 
org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.MASTER - - this.out = outputPhased - } - - class PhasingByACeval() extends org.broadinstitute.sting.queue.extensions.gatk.PhasingEval with CommandLineGATKArgs { - this.analysis = org.broadinstitute.sting.oneoffprojects.walkers.phasing.PhasingEval.Analysis.PHASING_BY_AC - - this.rodBind :+= RodBind(outputPhased.getName, "VCF", outputPhased) - - this.out = new File("phasing_by_ac." + outputPhased + ".txt") - } - -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/fromer/ScatteredFullVariantAnnotator.scala b/scala/qscript/oneoffs/fromer/ScatteredFullVariantAnnotator.scala deleted file mode 100644 index b575e9980..000000000 --- a/scala/qscript/oneoffs/fromer/ScatteredFullVariantAnnotator.scala +++ /dev/null @@ -1,59 +0,0 @@ -package oneoffs.fromer - -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.utils.interval.IntervalSetRule - -class ScatteredFullVariantAnnotator extends QScript { - qscript => - - @Argument(doc = "gatk jar file", shortName = "J", required = true) - var gatkJarFile: File = _ - - @Argument(shortName = "R", doc = "ref", required = true) - var referenceFile: File = _ - - @Argument(shortName = "L", doc = "Intervals", required = false) - var intervals: String = null - - @Input(doc = "level of parallelism. 
By default is set to 0 [no scattering].", shortName = "scatter", required = false) - var scatterCount = 0 - - @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) - var bams: File = _ - - @Input(doc = "variant calls to annotate", fullName = "variantVCF", shortName = "C", required = true) - var variantVCF: File = _ - - @Output(doc = "annotated file to output", shortName = "o", required = true) - var outputAnnotated: File = _ - - @Output(doc = "Memory limit", fullName = "memoryLimit", shortName = "m", required = false) - var memoryLimit = 3 - - def script = { - add(new ScatteredFullVariantAnnotator()) - } - - trait CommandLineGATKArgs extends CommandLineGATK { - if (qscript.intervals != null) { - this.intervalsString = List(qscript.intervals) - } - this.jarFile = qscript.gatkJarFile - this.reference_sequence = qscript.referenceFile - this.input_file = List(qscript.bams) - - this.memoryLimit = qscript.memoryLimit - this.logging_level = "INFO" - - this.rodToIntervalTrackName = "variant" - this.BTI_merge_rule = IntervalSetRule.INTERSECTION - } - - class ScatteredFullVariantAnnotator() extends org.broadinstitute.sting.queue.extensions.gatk.VariantAnnotator with CommandLineGATKArgs { - this.scatterCount = qscript.scatterCount - this.variantVCF = qscript.variantVCF - this.useAllAnnotations = true - this.out = qscript.outputAnnotated - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/hanna/DoC.scala b/scala/qscript/oneoffs/hanna/DoC.scala deleted file mode 100644 index 5a60f7166..000000000 --- a/scala/qscript/oneoffs/hanna/DoC.scala +++ /dev/null @@ -1,59 +0,0 @@ -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ - -/** - * A pipeline for Queue that runs a custom walker outside of the GATK jar. - * NOTE: This code is an unsupported example for soliciting feedback on how to improve Queue. 
- * Future syntax will simplify running the GATK so please expect the syntax below to change significantly. - */ -class DoC extends QScript { - // The full packaged jar should be used. - // You can build this jar via 'ant package' and then find it under - // 'Sting/dist/packages/GenomeAnalysisTK-*/GenomeAnalysisTK.jar' - @Input(doc="The path to the packaged GenomeAnalysisTK.jar file.", shortName="gatk") - var gatkJar: File = null - - @Input(doc="The reference file for the bam files.", shortName="R") - var referenceFile: File = null - - // NOTE: Do not initialize List, Set, or Option to null - // as you won't be able to update the collection. - // By default set: - // List[T] = Nil - // Set[T] = Set.empty[T] - // Option[T] = None - @Input(doc="One or more bam files.", shortName="I") - var bamFiles: List[File] = Nil - - @Input(doc="An optional file with a list of intervals to proccess.", shortName="L", required=false) - var intervalsString: List[String] = List("2:87000001-90000000") - - // This trait allows us set the variables below in one place, - // and then reuse this trait on each CommandLineGATK function below. - trait DepthOfCoverageArguments extends CommandLineGATK { - this.jarFile = DoC.this.gatkJar - this.reference_sequence = DoC.this.referenceFile - this.intervalsString = DoC.this.intervalsString - this.memoryLimit = 8 - } - - - def script = { - // Create the four function that we can run. - val doc = new DepthOfCoverage with DepthOfCoverageArguments - doc.downsampling_type = DownsampleType.NONE - doc.omitLocusTable = true - doc.omitIntervals = true - doc.omitSampleSummary = true - - // If you are running this on a compute farm, make sure that the Sting/shell - // folder is in your path to use mergeText.sh and splitIntervals.sh. 
- //doc.scatterCount = 3 - doc.input_file = DoC.this.bamFiles - doc.out = new File("doc-all.out") - - add(doc) - } - -} diff --git a/scala/qscript/oneoffs/hanna/PrintReadsAcrossManySamples.q b/scala/qscript/oneoffs/hanna/PrintReadsAcrossManySamples.q deleted file mode 100644 index 7da016b22..000000000 --- a/scala/qscript/oneoffs/hanna/PrintReadsAcrossManySamples.q +++ /dev/null @@ -1,59 +0,0 @@ -import java.io.PrintWriter -import org.broadinstitute.sting.queue.function.ListWriterFunction -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException - -/** - * A pipeline for Queue that runs a custom walker outside of the GATK jar. - * NOTE: This code is an unsupported example for soliciting feedback on how to improve Queue. - * Future syntax will simplify running the GATK so please expect the syntax below to change significantly. - */ -class PrintReadsAcrossManySamples extends QScript { - @Input(doc="The reference file for the bam files.", shortName="R") - var referenceFile: File = null - - // NOTE: Do not initialize List, Set, or Option to null - // as you won't be able to update the collection. - // By default set: - // List[T] = Nil - // Set[T] = Set.empty[T] - // Option[T] = None - @Input(doc="One or more bam files.", shortName="I") - var bamFile: File = _ - - @Input(doc="Name of the test case", fullName="test_case",required=false) - var testCaseName: String = "." 
- - @Input(doc="Max number of bam files to process", fullName="max_bams",required=false) - var maxBams = 1 - - @Input(doc="Step size",fullName="step_size",required=false) - var stepSize = 1 - - - def script = { - var lines = scala.io.Source.fromFile(bamFile).getLines.map(new File(_)).toList - - for(numBams <- 1 to (maxBams min lines.size) by stepSize) { - val basePath = "%s/%03d_bams".format(testCaseName,numBams) - - val writeBamList = new ListWriterFunction - writeBamList.inputFiles = lines.take(numBams) - writeBamList.listFile = new File(basePath+"/bams.list") - add(writeBamList) - - // Create the function that we can run. - val printreads = new PrintReads - - printreads.jobOutputFile = new File(basePath+"/PrintReads.out") - printreads.input_file = List(writeBamList.listFile) - printreads.reference_sequence = referenceFile - printreads.out = new File("/dev/null") - printreads.memoryLimit = 8 - - add(printreads) - } - - } -} diff --git a/scala/qscript/oneoffs/hanna/runprintreads.sh b/scala/qscript/oneoffs/hanna/runprintreads.sh deleted file mode 100644 index daf395ed6..000000000 --- a/scala/qscript/oneoffs/hanna/runprintreads.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh -java -Djava.io.tmpdir=/broad/shptmp/hanna -jar ~/src/Sting/dist/Queue.jar \ - --script PrintReadsAcrossManySamples.q \ - -gatk ~/src/Sting/dist/GenomeAnalysisTK.jar \ - -R /humgen/1kg/reference/human_g1k_v37.fasta \ - -I ~/tests/1600samples/1kg_t2d.list --max_bams 2000 --step_size 10 -bsub -jobQueue week $1 diff --git a/scala/qscript/oneoffs/kshakir/UGMemoryTests.scala b/scala/qscript/oneoffs/kshakir/UGMemoryTests.scala deleted file mode 100644 index 132f12cc8..000000000 --- a/scala/qscript/oneoffs/kshakir/UGMemoryTests.scala +++ /dev/null @@ -1,52 +0,0 @@ -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.utils.yaml.YamlUtils -import 
collection.JavaConversions._ - -class UGMemoryTests extends QScript { - qscript => - - @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y") - var yamlFile: File = _ - - @Input(doc="The path to the GenomeAnalysisTK.jar file.", shortName="gatk") - var gatkJar: File = null - - @Input(doc="per-sample downsampling level",shortName="dcov",required=false) - var downsampling_coverage = 300 - - def script = { - val pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile) - val memoryLimits = List(1,2,4,6,8,10,12,16) - val recalibratedSamples = pipeline.getSamples.map(_.getBamFiles.get("recalibrated")).toList - val squid1 = "C315" - val squid2 = "C338" - - val numBamsList = List(10, 20, 50, 70, 100, 120, 150) - val squid1Bams = recalibratedSamples.filter(_.getAbsolutePath.contains(squid1)) - val squid2Bams = recalibratedSamples.filter(_.getAbsolutePath.contains(squid2)) - - for (memoryLimit <- memoryLimits) { - for (numBams <- numBamsList) { - val dir = "%03d_bams_%02dg".format(numBams, memoryLimit) - - val snps = new UnifiedGenotyper - snps.jobOutputFile = new File(dir, "UnifiedGenotyper.out") - snps.out = new File(dir, "UnifiedGenotyper.vcf") - snps.input_file = squid1Bams.take(numBams/2) ++ squid2Bams.take(numBams/2) - snps.memoryLimit = memoryLimit - - snps.jarFile = qscript.gatkJar - snps.reference_sequence = pipeline.getProject.getReferenceFile - snps.intervals = List(pipeline.getProject.getIntervalList) - snps.rodBind :+= new RodBind("dbsnp", pipeline.getProject.getGenotypeDbsnpType, pipeline.getProject.getGenotypeDbsnp) - snps.downsample_to_coverage = qscript.downsampling_coverage - snps.annotation ++= List("AlleleBalance") - snps.group :+= "Standard" - - add(snps) - } - } - } -} diff --git a/scala/qscript/oneoffs/kshakir/UGMemoryTests.sh b/scala/qscript/oneoffs/kshakir/UGMemoryTests.sh deleted file mode 100755 index 15ec50f75..000000000 --- a/scala/qscript/oneoffs/kshakir/UGMemoryTests.sh +++ /dev/null @@ 
-1,12 +0,0 @@ -#!/bin/sh - -STING_HOME=/humgen/gsa-hpprojects/dev/kshakir/src/Sting_patches -TMP_DIR=/broad/shptmp/kshakir -JOB_QUEUE=gsa - -if [ "$1" == "debug" ]; then - JAVA_DEBUG="-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8555" - shift -fi - -java $JAVA_DEBUG -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobPrefix QTest -S "$STING_HOME"/scala/qscript/kshakir/UGMemoryTests.scala -Y UGMemoryTests.yaml -gatk "$STING_HOME"/dist/GenomeAnalysisTK.jar -jobQueue $JOB_QUEUE $@ diff --git a/scala/qscript/oneoffs/kshakir/UGMemoryTests.yaml b/scala/qscript/oneoffs/kshakir/UGMemoryTests.yaml deleted file mode 100644 index 8f5dfa310..000000000 --- a/scala/qscript/oneoffs/kshakir/UGMemoryTests.yaml +++ /dev/null @@ -1,610 +0,0 @@ -{ - project: { - name: UGMemoryTests, - referenceFile: /seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta, - dbsnpFile: /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod, - intervalList: /humgen/gsa-hpprojects/GATK/data/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr1.interval_list - }, - samples: [ - { - id: C315_32742, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/32742/v3/32742.bam } - }, - { - id: C315_28-0154, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/28-0154/v3/28-0154.bam } - }, - { - id: C315_A08694, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/A08694/v1/A08694.bam } - }, - { - id: C315_9218, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/9218/v3/9218.bam } - }, - { - id: C315_42284, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/42284/v2/42284.bam } - }, - { - id: C315_395607-59, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/395607-59/v1/395607-59.bam } - }, - { - id: C315_12751, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/12751/v2/12751.bam } - }, - { - id: C315_A02027, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/A02027/v5/A02027.bam } - }, - { - id: 
C315_389822-58, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/389822-58/v1/389822-58.bam } - }, - { - id: C315_15899, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/15899/v1/15899.bam } - }, - { - id: C315_47661, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/47661/v3/47661.bam } - }, - { - id: C315_209541-66, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/209541-66/v1/209541-66.bam } - }, - { - id: C315_49535, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/49535/v2/49535.bam } - }, - { - id: C315_496560-33, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/496560-33/v1/496560-33.bam } - }, - { - id: C315_4039, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/4039/v2/4039.bam } - }, - { - id: C315_492677-36, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/492677-36/v1/492677-36.bam } - }, - { - id: C315_40716, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/40716/v2/40716.bam } - }, - { - id: C315_38201, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/38201/v2/38201.bam } - }, - { - id: C315_500277-48, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/500277-48/v1/500277-48.bam } - }, - { - id: C315_22866, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/22866/v1/22866.bam } - }, - { - id: C315_507365-44, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/507365-44/v2/507365-44.bam } - }, - { - id: C315_407001-34, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/407001-34/v1/407001-34.bam } - }, - { - id: C315_51248, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/51248/v1/51248.bam } - }, - { - id: C315_427532-47, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/427532-47/v1/427532-47.bam } - }, - { - id: C315_6767, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/6767/v2/6767.bam } - }, - { - id: C315_52221, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/52221/v2/52221.bam } - }, - { - id: C315_14779, - 
bamFiles: { recalibrated: /seq/picard_aggregation/C315/14779/v1/14779.bam } - }, - { - id: C315_19309, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/19309/v1/19309.bam } - }, - { - id: C315_497395-47, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/497395-47/v1/497395-47.bam } - }, - { - id: C315_50333, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/50333/v1/50333.bam } - }, - { - id: C315_472444-60, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/472444-60/v2/472444-60.bam } - }, - { - id: C315_548668-34, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/548668-34/v1/548668-34.bam } - }, - { - id: C315_335840-68, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/335840-68/v1/335840-68.bam } - }, - { - id: C315_265276-65, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/265276-65/v3/265276-65.bam } - }, - { - id: C315_17480, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/17480/v1/17480.bam } - }, - { - id: C315_426521-75, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/426521-75/v1/426521-75.bam } - }, - { - id: C315_222034-64, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/222034-64/v1/222034-64.bam } - }, - { - id: C315_pcath980626-1, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/pcath980626-1/v3/pcath980626-1.bam } - }, - { - id: C315_527830-33, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/527830-33/v1/527830-33.bam } - }, - { - id: C315_421826-53, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/421826-53/v1/421826-53.bam } - }, - { - id: C315_217094-74, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/217094-74/v3/217094-74.bam } - }, - { - id: C315_562474-57, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/562474-57/v1/562474-57.bam } - }, - { - id: C315_434049-48, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/434049-48/v1/434049-48.bam } - }, - { - id: C315_360268-49, - bamFiles: { recalibrated: 
/seq/picard_aggregation/C315/360268-49/v2/360268-49.bam } - }, - { - id: C315_528492-65, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/528492-65/v1/528492-65.bam } - }, - { - id: C315_206691-53, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/206691-53/v1/206691-53.bam } - }, - { - id: C315_19156, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/19156/v1/19156.bam } - }, - { - id: C315_364827-70, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/364827-70/v1/364827-70.bam } - }, - { - id: C315_544273-42, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/544273-42/v1/544273-42.bam } - }, - { - id: C315_41645, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/41645/v1/41645.bam } - }, - { - id: C315_39048, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/39048/v1/39048.bam } - }, - { - id: C315_14007, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/14007/v2/14007.bam } - }, - { - id: C315_395725-33, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/395725-33/v1/395725-33.bam } - }, - { - id: C315_42291, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/42291/v1/42291.bam } - }, - { - id: C315_31981, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/31981/v2/31981.bam } - }, - { - id: C315_87A84DD1, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/87A84DD1/v2/87A84DD1.bam } - }, - { - id: C315_54393, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/54393/v1/54393.bam } - }, - { - id: C315_15974, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/15974/v1/15974.bam } - }, - { - id: C315_543091-49, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/543091-49/v2/543091-49.bam } - }, - { - id: C315_283916-44, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/283916-44/v1/283916-44.bam } - }, - { - id: C315_49900, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/49900/v2/49900.bam } - }, - { - id: C315_460187-33, - bamFiles: { recalibrated: 
/seq/picard_aggregation/C315/460187-33/v2/460187-33.bam } - }, - { - id: C315_48019, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/48019/v2/48019.bam } - }, - { - id: C315_329427-69, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/329427-69/v1/329427-69.bam } - }, - { - id: C315_A06518, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/A06518/v1/A06518.bam } - }, - { - id: C315_35484, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/35484/v5/35484.bam } - }, - { - id: C315_325920-37, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/325920-37/v1/325920-37.bam } - }, - { - id: C315_25775, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/25775/v1/25775.bam } - }, - { - id: C315_202228-58, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/202228-58/v1/202228-58.bam } - }, - { - id: C315_542914-48, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/542914-48/v3/542914-48.bam } - }, - { - id: C315_36047, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/36047/v2/36047.bam } - }, - { - id: C315_232846-59, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/232846-59/v1/232846-59.bam } - }, - { - id: C315_5760, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/5760/v1/5760.bam } - }, - { - id: C315_348907-53, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/348907-53/v1/348907-53.bam } - }, - { - id: C315_8891, - bamFiles: { recalibrated: /seq/picard_aggregation/C315/8891/v2/8891.bam } - }, - { - id: C338_00164219, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00164219/v3/00164219.bam } - }, - { - id: C338_00339745, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00339745/v2/00339745.bam } - }, - { - id: C338_00339753, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00339753/v2/00339753.bam } - }, - { - id: C338_00347320, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00347320/v1/00347320.bam } - }, - { - id: C338_00341953, - bamFiles: { 
recalibrated: /seq/picard_aggregation/C338/00341953/v2/00341953.bam } - }, - { - id: C338_00347335, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00347335/v2/00347335.bam } - }, - { - id: C338_00347323, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00347323/v2/00347323.bam } - }, - { - id: C338_00313755, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313755/v2/00313755.bam } - }, - { - id: C338_00344108, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00344108/v1/00344108.bam } - }, - { - id: C338_00313306, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313306/v1/00313306.bam } - }, - { - id: C338_00341959, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00341959/v3/00341959.bam } - }, - { - id: C338_00344030, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00344030/v3/00344030.bam } - }, - { - id: C338_00344099, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00344099/v3/00344099.bam } - }, - { - id: C338_00339767, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00339767/v2/00339767.bam } - }, - { - id: C338_00347317, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00347317/v3/00347317.bam } - }, - { - id: C338_00338716, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00338716/v2/00338716.bam } - }, - { - id: C338_00314085, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00314085/v2/00314085.bam } - }, - { - id: C338_00339707, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00339707/v2/00339707.bam } - }, - { - id: C338_00342149, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00342149/v2/00342149.bam } - }, - { - id: C338_00339680, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00339680/v3/00339680.bam } - }, - { - id: C338_00314089, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00314089/v1/00314089.bam } - }, - { - id: C338_00347305, - bamFiles: { recalibrated: 
/seq/picard_aggregation/C338/00347305/v3/00347305.bam } - }, - { - id: C338_00347299, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00347299/v2/00347299.bam } - }, - { - id: C338_00314127, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00314127/v4/00314127.bam } - }, - { - id: C338_00314042, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00314042/v1/00314042.bam } - }, - { - id: C338_00313624, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313624/v2/00313624.bam } - }, - { - id: C338_00347929, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00347929/v2/00347929.bam } - }, - { - id: C338_00340223, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00340223/v3/00340223.bam } - }, - { - id: C338_00314130, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00314130/v2/00314130.bam } - }, - { - id: C338_00342001, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00342001/v3/00342001.bam } - }, - { - id: C338_00313906, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313906/v2/00313906.bam } - }, - { - id: C338_00313844, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313844/v2/00313844.bam } - }, - { - id: C338_00153519, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00153519/v1/00153519.bam } - }, - { - id: C338_00071493, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00071493/v2/00071493.bam } - }, - { - id: C338_00314083, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00314083/v2/00314083.bam } - }, - { - id: C338_00334568, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00334568/v6/00334568.bam } - }, - { - id: C338_00346347, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00346347/v3/00346347.bam } - }, - { - id: C338_00180648, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00180648/v1/00180648.bam } - }, - { - id: C338_00187275, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00187275/v1/00187275.bam } - }, 
- { - id: C338_00346283, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00346283/v3/00346283.bam } - }, - { - id: C338_00313933, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313933/v5/00313933.bam } - }, - { - id: C338_00313479, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313479/v2/00313479.bam } - }, - { - id: C338_00313422, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313422/v1/00313422.bam } - }, - { - id: C338_00153459, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00153459/v2/00153459.bam } - }, - { - id: C338_00340147, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00340147/v3/00340147.bam } - }, - { - id: C338_00308255, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00308255/v3/00308255.bam } - }, - { - id: C338_00341944, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00341944/v3/00341944.bam } - }, - { - id: C338_00314081, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00314081/v2/00314081.bam } - }, - { - id: C338_00339729, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00339729/v2/00339729.bam } - }, - { - id: C338_00340121, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00340121/v2/00340121.bam } - }, - { - id: C338_00164078, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00164078/v1/00164078.bam } - }, - { - id: C338_00314037, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00314037/v2/00314037.bam } - }, - { - id: C338_00313708, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313708/v3/00313708.bam } - }, - { - id: C338_00346266, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00346266/v3/00346266.bam } - }, - { - id: C338_00313914, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313914/v1/00313914.bam } - }, - { - id: C338_00340093, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00340093/v2/00340093.bam } - }, - { - id: C338_00313909, - bamFiles: { recalibrated: 
/seq/picard_aggregation/C338/00313909/v1/00313909.bam } - }, - { - id: C338_00347739, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00347739/v2/00347739.bam } - }, - { - id: C338_00338680, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00338680/v3/00338680.bam } - }, - { - id: C338_00347283, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00347283/v3/00347283.bam } - }, - { - id: C338_00180679, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00180679/v1/00180679.bam } - }, - { - id: C338_00313247, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313247/v1/00313247.bam } - }, - { - id: C338_00174844, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00174844/v2/00174844.bam } - }, - { - id: C338_00313450, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313450/v1/00313450.bam } - }, - { - id: C338_00313626, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313626/v2/00313626.bam } - }, - { - id: C338_00313311, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313311/v2/00313311.bam } - }, - { - id: C338_00313988, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313988/v2/00313988.bam } - }, - { - id: C338_00314078, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00314078/v2/00314078.bam } - }, - { - id: C338_00313721, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313721/v2/00313721.bam } - }, - { - id: C338_00347894, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00347894/v2/00347894.bam } - }, - { - id: C338_00329142, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00329142/v1/00329142.bam } - }, - { - id: C338_00313304, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00313304/v1/00313304.bam } - }, - { - id: C338_00334599, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00334599/v2/00334599.bam } - }, - { - id: C338_00339674, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00339674/v2/00339674.bam } - }, 
- { - id: C338_00234650, - bamFiles: { recalibrated: /seq/picard_aggregation/C338/00234650/v1/00234650.bam } - } - ] -} diff --git a/scala/qscript/oneoffs/kshakir/linearindexbintests/LinearIndexBinTests.scala b/scala/qscript/oneoffs/kshakir/linearindexbintests/LinearIndexBinTests.scala deleted file mode 100644 index 1b23c110d..000000000 --- a/scala/qscript/oneoffs/kshakir/linearindexbintests/LinearIndexBinTests.scala +++ /dev/null @@ -1,68 +0,0 @@ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.utils.text._ -import org.broadinstitute.sting.queue.extensions.gatk._ -import collection.JavaConversions._ - -class LinearIndexBinTests extends QScript { - qscript => - - @Input(doc="The path to the GenomeAnalysisTK.jar file.", shortName="gatk") - var gatkJar: File = null - - @Argument(doc="Rod list to test. The first line in the list is the reference.", shortName="BL") - var rodLists: List[File] = Nil - - @Argument(doc="Number of times to run the test.", shortName="numRuns", required=false) - var numRuns = 1 - - @Input(doc="memory limits", shortName="mem", required=true) - var memoryLimits: List[Int] = Nil - - @Input(doc="max bin size", shortName="maxBin", required=false) - var maxBinSize = 512 - - def script = { - var maxFeaturesPerBin = List.empty[String] - var currMaxFeatures = maxBinSize - while (currMaxFeatures > 1) { - maxFeaturesPerBin +:= currMaxFeatures.toString - currMaxFeatures /= 2 - } - maxFeaturesPerBin :::= List("0.001", "0.01", "0.1", "1") - - for (run <- 1 to numRuns) { - for (rodList <- rodLists) { - val rodListName = rodList.getName - val lines = new XReadLines(rodList).iterator - val reference = new File(lines.next) - val rodFiles = lines.map(path => new File(path)).toList - - for (memoryLimit <- memoryLimits) { - for (maxFeatures <- maxFeaturesPerBin) { - val dir = "%s_%smfpb_%02dg_run%02d".format(rodListName, "00000".take(5-maxFeatures.length) + maxFeatures, memoryLimit, run) - - val countRod = new CountRod { - override 
def javaOpts = super.javaOpts + " -DMAX_FEATURES_PER_BIN=" + maxFeatures - } - countRod.jobOutputFile = new File(dir, "CountRod.out") - countRod.out = new File(dir, "CountRod.txt") - - countRod.jarFile = qscript.gatkJar - countRod.reference_sequence = reference - countRod.memoryLimit = memoryLimit - - // Some of the BED files don't have a chrM, which makes the GATK angry. Run unsafe. - countRod.U = org.broadinstitute.sting.gatk.arguments.ValidationExclusion.TYPE.ALL - - for ((rodFile, index) <- rodFiles.zipWithIndex) { - val rodType = rodFile.getName.split("\\.").last - countRod.rodBind :+= new RodBind(rodType + (index+1), rodType.toUpperCase, rodFile.getAbsolutePath) - } - - add(countRod) - } - } - } - } - } -} diff --git a/scala/qscript/oneoffs/kshakir/linearindexbintests/LinearIndexBinTests.sh b/scala/qscript/oneoffs/kshakir/linearindexbintests/LinearIndexBinTests.sh deleted file mode 100755 index 82a0cc929..000000000 --- a/scala/qscript/oneoffs/kshakir/linearindexbintests/LinearIndexBinTests.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh - -STING_HOME=/humgen/gsa-hpprojects/dev/kshakir/src/Sting_CountRod -TMP_DIR=/broad/shptmp/kshakir/CountRod -JOB_QUEUE= -STATUS_TO= -#JOB_QUEUE="-jobQueue week" -#STATUS_TO="-statusTo kshakir" - -if [ "$1" == "debug" ]; then - JAVA_DEBUG="-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8555" - shift -fi - -java $JAVA_DEBUG -Djava.io.tmpdir=${TMP_DIR} -jar ${STING_HOME}/dist/Queue.jar -jobPrefix QCountRodTest -S LinearIndexBinTests.scala -gatk ${STING_HOME}/dist/GenomeAnalysisTK.jar -jobQueue ${JOB_QUEUE} ${STATUS_TO} -bsub $@ diff --git a/scala/qscript/oneoffs/kshakir/linearindexbintests/README.txt b/scala/qscript/oneoffs/kshakir/linearindexbintests/README.txt deleted file mode 100644 index 515c8f48b..000000000 --- a/scala/qscript/oneoffs/kshakir/linearindexbintests/README.txt +++ /dev/null @@ -1,54 +0,0 @@ -DESCRIPTION ------------ - -This folder contains a set of test scripts for evaluating the 
MAX_FEATURES_PER_BIN setting in tribble/src/org/broad/tribble/index/linear/LinearIndex.java - -For the tests to work you must patch the tribble code to enable the MAX_FEATURES_PER_BIN to be set via a system property, for example: - java -jar GenomeAnalysisTK.jar -DMAX_FEATURES_PER_BIN=1 ... - - -SCRIPTS -------- - -*** LinearIndexBinTests.sh *** - -Runs the scala script LinearIndexBinTests.scala. Requires that you pass rods via "rod lists" (see below) and specify how much memory to run each set of tests with. - -Example dry run: - ./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs - -Example run: - ./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs -run - -Example run on the hour queue: - ./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs -jobQueue hour -run - -Example run on the hour queue with each job run three times: - ./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs -jobQueue hour -numRuns 3 -run - -*** grep_results.sh *** - -Greps the CPU and Max Memory statistics from the LSF output files into a file mfpb.txt. - -Example: - ./grep_results.sh - [outputs: ./mfpb.txt] - -*** plot_results.R *** - -Creates a plot from a subset of the data in mfpb.txt. Can be run multiple times to produces plots for the different memory limits passed to LinearIndexBinTests.sh - -Example: - ./plot_results.R mfpb.txt 2 - ./plot_results.R mfpb.txt 4 - ./plot_results.R mfpb.txt 6 - [outputs: ./max_features_per_bin_Xmx2g.pdf ./max_features_per_bin_Xmx4g.pdf ./max_features_per_bin_Xmx6g.pdf] - - -ROD LISTS ---------- - -A rod list is a file that contains the FASTA reference on the first line, and then 1..N ROD files in the rest of the file. The RODs must all end with an extension that corresponds to the rod type, for example: .vcf, .bed, etc. 
- -Example: - [See test_vcfs] diff --git a/scala/qscript/oneoffs/kshakir/linearindexbintests/grep_results.sh b/scala/qscript/oneoffs/kshakir/linearindexbintests/grep_results.sh deleted file mode 100755 index 91aa41750..000000000 --- a/scala/qscript/oneoffs/kshakir/linearindexbintests/grep_results.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh - -DIRECTORY=. - -find $DIRECTORY -name CountRod.out -exec grep -H "Max Memory" {} \; | sort | sed -e 's!'$DIRECTORY'/\(.*\)_\([.0-9]*\)mfpb_\(.*\)g_run\(.*\)/CountRod.out: * Max Memory *: *\([.0-9]*\) MB!\1\t\2\t\3\t\4\t\5!' > $DIRECTORY/memory.txt - -find $DIRECTORY -name CountRod.out -exec grep -H "CPU" {} \; | sort | sed -e 's!'$DIRECTORY'/\(.*\)_\([.0-9]*\)mfpb_\(.*\)g_run\(.*\)/CountRod.out: * CPU time *: *\([.0-9]*\) sec.!\1\t\2\t\3\t\4\t\5!' > $DIRECTORY/cpu.txt - -find $DIRECTORY -name \*.done -o -name \*.fail | sort | sed -e 's!'$DIRECTORY'/\(.*\)_\([.0-9]*\)mfpb_\(.*\)g_run\(.*\)/\.CountRod.txt\.*\(.*\)!\1\t\2\t\3\t\4\t\5!' > $DIRECTORY/success.txt - -TAB=" " -echo "set${TAB}max_features_per_bin${TAB}memory_limit_gb${TAB}run_number${TAB}max_memory_mb${TAB}cpu_s${TAB}job_success" > $DIRECTORY/mfpb.txt -paste $DIRECTORY/memory.txt $DIRECTORY/cpu.txt $DIRECTORY/success.txt | cut -f 1-5,10,15 >> $DIRECTORY/mfpb.txt diff --git a/scala/qscript/oneoffs/kshakir/linearindexbintests/plot_results.R b/scala/qscript/oneoffs/kshakir/linearindexbintests/plot_results.R deleted file mode 100755 index 2d6d44504..000000000 --- a/scala/qscript/oneoffs/kshakir/linearindexbintests/plot_results.R +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/env Rscript - -require(lattice) -require(sqldf) - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] -memory = args[2] - -mfpb_data <- read.table(input, head=T) -mfpb_data <- mfpb_data[order(mfpb_data$set, mfpb_data$max_features_per_bin, mfpb_data$memory_limit_gb, mfpb_data$run_number) , ] -mfpb_data <- sqldf("select \"set\", max_features_per_bin, memory_limit_gb, avg(cpu_s) as cpu_s, 
avg(max_memory_mb) as max_memory_mb from mfpb_data where job_success = 'done' group by \"set\", max_features_per_bin, memory_limit_gb") - -outfile = paste("max_features_per_bin_Xmx", memory, "g.pdf", sep="") -pdf(outfile, height=7, width=14) - -par(cex=1.3) - -xyplot(max_memory_mb + cpu_s ~ log10(max_features_per_bin), groups = set, data = subset(mfpb_data, memory_limit_gb == memory), type="b", scales=list(relation="free"), auto.key=T) - -dev.off() diff --git a/scala/qscript/oneoffs/kshakir/linearindexbintests/test_vcfs b/scala/qscript/oneoffs/kshakir/linearindexbintests/test_vcfs deleted file mode 100644 index 9296ec2ed..000000000 --- a/scala/qscript/oneoffs/kshakir/linearindexbintests/test_vcfs +++ /dev/null @@ -1,3 +0,0 @@ -/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta -/humgen/gsa-firehose2/pipeline/projects/Barcoded_1000G_WEx_Plate_1/v5/IndelCalls/IntermediateFiles/Barcoded_1000G_WEx_HG01359/HG01359.cleaned.indels.vcf -/humgen/gsa-firehose2/pipeline/projects/Barcoded_1000G_WEx_Plate_1/v5/IndelCalls/IntermediateFiles/Barcoded_1000G_WEx_NA19670/NA19670.cleaned.indels.vcf diff --git a/scala/qscript/oneoffs/rpoplin/ASHGcalling.scala b/scala/qscript/oneoffs/rpoplin/ASHGcalling.scala deleted file mode 100755 index 90803c725..000000000 --- a/scala/qscript/oneoffs/rpoplin/ASHGcalling.scala +++ /dev/null @@ -1,190 +0,0 @@ -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils - -class ASHGcalling extends QScript { - qscript => - - @Input(doc="path to GATK jar", shortName="gatk", 
required=true) - var gatkJar: File = _ - - @Input(doc="the chromosome to process", shortName="chr", required=true) - var chr: Int = _ - - @Input(doc="output path", shortName="outputDir", required=false) - var outputDir: String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/" - - @Input(doc="base output filename", shortName="baseName", required=false) - var baseName: String = "ALL.august" - - @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false) - var outputTmpDir: String = "/humgen/gsa-hpprojects/august_cleaned_bams" - - private val tmpDir: File = new File("/broad/shptmp/rpoplin/") - private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod") - private val targetIntervals: File = new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/knownIndels.intervals") - private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf" - private val dindelMask: String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/pilot1.dindel.mask.bed" - val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566) - val populations = List("YRI","LWK","ASW","PUR","CEU","TSI","GBR","FIN","MXL","CHB","CHS","JPT") - - private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference - this.memoryLimit = 2 - this.DBSNP = qscript.dbSNP - this.jobTempDir = qscript.tmpDir - } - - class SamtoolsBaqFunction extends CommandLineFunction { - @Input(doc="The input BAM file") var in_bam: File = _ - @Output(doc="The 
output BAM file") var out_bam: File = _ - def commandLine = "/humgen/gsa-scr1/rpoplin/samtools/samtools calmd -br %s %s > %s".format(in_bam.getAbsolutePath, qscript.reference, out_bam.getAbsolutePath) - } - - class DeleteMeFunction extends CommandLineFunction { - @Input(doc="The file to be deleted") var me: File = _ - @Input(doc="The file which must exist before we are allowed to delete") var trigger: File = _ - def commandLine = "rm -f %s".format(me.getAbsolutePath) - } - - class DeleteMeAllFunction extends CommandLineFunction { - @Input(doc="The file to be deleted") var me: File = _ - @Input(doc="The file which must exist before we are allowed to delete") var trigger: File = _ - def commandLine = "rm -f %s*".format(me.getAbsolutePath) - } - - def script = { - val basesPerJob: Int = 3000000 - val lastBase: Int = qscript.chromosomeLength(qscript.chr - 1) - var start: Int = 1 - var stop: Int = start - 1 + basesPerJob - if( stop > lastBase ) { stop = lastBase } - var jobNumber: Int = 1 - while( jobNumber < (lastBase.toFloat / basesPerJob.toFloat) + 1.0) { - callThisChunk("%d:%d-%d".format(qscript.chr, start, stop), jobNumber) - start += basesPerJob - stop += basesPerJob - if( stop > lastBase ) { stop = lastBase } - jobNumber += 1 - } - - /* CombineVariants parses the 800+ genotypes per record and is way too slow. 
Combine the vcf files together using grep, cat, and sortByRef.pl outside of Queue - combineVariants = new CombineVariants with CommandLineGATKArgs - combineVariants.rodBind = vcfChunks - combineVariants.out = new TaggedFile(qscript.baseName + ".chr" + qscript.chr.toString + ".filtered.vcf", "vcf") - combineVariants.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION - combineVariants.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNSORTED - combineVariants.setKey = "null" - add(combineVariants) - */ - } - - def callThisChunk(interval: String, jobNumber: Int) = { - - val baseName: String = qscript.outputDir + "/chr" + qscript.chr.toString + "/" + qscript.baseName + ".chr" + qscript.chr.toString + "." + jobNumber.toString +"." - var call = new UnifiedGenotyperV2 with CommandLineGATKArgs - val rawCalls = new File(baseName + "raw.vcf") - - for( population <- qscript.populations ) { - val baseTmpName: String = qscript.outputTmpDir + "/chr" + qscript.chr.toString + "/" + population + ".august.chr" + qscript.chr.toString + "." + jobNumber.toString +"." - val bamList: File = new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/bamLists/%s.chr%d.bam.list".format(population, qscript.chr)) - - // 1.) Clean at known indels - var clean = new IndelRealigner with CommandLineGATKArgs - val cleanedBam = new File(baseTmpName + "cleaned.bam") - clean.memoryLimit = 4 - clean.input_file :+= bamList - clean.intervalsString :+= interval - clean.targetIntervals = qscript.targetIntervals - clean.out = cleanedBam - clean.rodBind :+= RodBind("indels", "VCF", qscript.dindelCalls) - clean.knownsOnly = true - clean.LOD = 1.0 - clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true - clean.compress = 2 - clean.jobName = baseName + population + ".clean" - //clean.stripBam = true - //clean.fileSystemUsage = "indium" - - // 2.) 
Apply BAQ calculation - var baq = new SamtoolsBaqFunction - val baqedBam = new File(baseTmpName + "cleaned.baq.bam") - baq.memoryLimit = 4 - baq.in_bam = cleanedBam - baq.out_bam = baqedBam - baq.jobName = baseName + population + ".baq" - //baq.fileSystemUsage = "iodine" - - // 3a.) Delete cleaned bam - var deleteClean = new DeleteMeFunction - deleteClean.me = cleanedBam - deleteClean.trigger = baqedBam - deleteClean.jobName = baseName + population + ".deleteClean" - //deleteClean.fileSystemUsage = "iodine" - - // 3b.) Index BAQ'ed bam - var index = new SamtoolsIndexFunction - index.bamFile = baqedBam - index.jobName = baseName + population + ".index" - //index.fileSystemUsage = "iodine" - - // 5a.) Delete BAQ'ed bam and index - //var deleteBaq = new DeleteMeAllFunction - //deleteBaq.me = baqedBam - //deleteBaq.trigger = rawCalls - //deleteBaq.jobName = baseName + population + ".deleteBaq" - //deleteBaq.fileSystemUsage = "iodine" - - call.input_file :+= baqedBam - - //add(clean, baq, deleteClean, index, deleteBaq) - add(clean, baq, deleteClean, index) - } - - // 4.) Call with UGv2 - call.memoryLimit = 4 - call.intervalsString :+= interval - call.out = rawCalls - call.dcov = 50 - call.standard_min_confidence_threshold_for_calling = 50 - call.standard_min_confidence_threshold_for_emitting = 30 - call.min_mapping_quality_score = 20 - call.min_base_quality_score = 20 - call.pnrm = org.broadinstitute.sting.playground.gatk.walkers.genotyper.AlleleFrequencyCalculationModel.Model.GRID_SEARCH - call.jobName = baseName + "call" - //call.fileSystemUsage = "iodine" - - // 5b.) 
Filter near indels and HARD_TO_VALIDATE - var filter = new VariantFiltration with CommandLineGATKArgs - val filteredCalls = new File(baseName + "filtered.vcf") - filter.memoryLimit = 1 - filter.out = filteredCalls - filter.intervalsString :+= interval - filter.variantVCF = rawCalls - filter.rodBind :+= RodBind("mask", "Bed", qscript.dindelMask) - filter.maskName = "InDel" - filter.filterName ++= List("HARD_TO_VALIDATE") - filter.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"") - filter.jobName = baseName + "filter" - //filter.fileSystemUsage = "indium" - - // 6.) Delete raw calls and index - var deleteRawCalls = new DeleteMeAllFunction - deleteRawCalls.me = rawCalls - deleteRawCalls.trigger = filteredCalls - deleteRawCalls.jobName = baseName + "deleteRawCalls" - //deleteRawCalls.fileSystemUsage = "indium" - - add(call, filter, deleteRawCalls) - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/rpoplin/Phase1Calling.scala b/scala/qscript/oneoffs/rpoplin/Phase1Calling.scala deleted file mode 100755 index afa547e35..000000000 --- a/scala/qscript/oneoffs/rpoplin/Phase1Calling.scala +++ /dev/null @@ -1,134 +0,0 @@ -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils - -class Phase1Calling extends QScript { - qscript => - - @Input(doc="path to GATK jar", shortName="gatk", required=true) - var gatkJar: File = _ - - @Input(doc="the chromosome to process", shortName="chr", required=false) - var chr: Int = 20 - - @Input(doc="output path", shortName="outputDir", required=false) - var outputDir: String = 
"/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams" - - @Input(doc="base output filename", shortName="baseName", required=false) - var baseName: String = "" - - @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false) - var outputTmpDir: String = "/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams" - - private val tmpDir: File = new File("/broad/shptmp/rpoplin/tmp/") - private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod") - private val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf" - private val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/AFR.dindel_august_release.20110110.sites.vcf.gz" - private val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/ASN.dindel_august_release.20110110.sites.vcf.gz" - private val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/EUR.dindel_august_release.20110110.sites.vcf.gz" - private val dindelMask: String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/pilot1.dindel.mask.bed" - val hapmap = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" - val g1k = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg_pilot1_projectCalls/ALL.low_coverage.2010_07.hg19.vcf" - val omni = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/764samples.deduped.b37.annot.vcf" - val chromosomeLength = 
List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566) - val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","JPT","LWK","MXL","PUR","TSI","YRI") - //val populations = List("JPT","ASN","AMR") - //val populations = List("EUR","AMR","ASN","AFR") - //val populations = List("FIN", "LWK") - private val intervals: String = "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals" - //val populations = List("ZZZ") // small set used for debugging - - private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference - this.memoryLimit = 3 - this.jobTempDir = qscript.tmpDir - this.DBSNP = qscript.dbSNP - } - - def script = { - callThisChunk() // using scatter/gather capabilities of Queue so no need to for loop over 1Mb chunks of the chromosome - } - - def callThisChunk() = { - - val interval = "%d".format(qscript.chr) - for( population <- qscript.populations ) { - val baseName: String = qscript.outputDir + "/" + population + ".phase1.chr" + qscript.chr.toString - var bamList: File = new File("/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams/%s.phase1.chr%d.cleaned.bam".format(population, qscript.chr)) - if( population == "ASN" || population == "EUR" || population == "AFR" || population == "AMR" ) { - bamList = new File("/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams/%s.chr%d.cleaned.list".format(population, qscript.chr)) - } - - val rawCalls = new File(baseName + ".raw.vcf") - val filteredCalls = new File(baseName + ".filtered.vcf") - val clusterFile = new File(baseName + ".omni.clusters") - val recalibratedCalls = new File(baseName + ".recal.vcf") - val tranchesFile = new File(baseName + 
".ts.omni.tranches") - - var call = new UnifiedGenotyper with CommandLineGATKArgs - call.intervalsString ++= List(qscript.intervals) - call.scatterCount = 63 // the smallest interval list has 63 intervals, one for each Mb on chr20 - call.dcov = 50 - call.stand_call_conf = 4.0 - call.stand_emit_conf = 4.0 - call.input_file :+= bamList - call.out = rawCalls - call.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY - call.analysisName = baseName + "_UG" - - var filter = new VariantFiltration with CommandLineGATKArgs - filter.intervalsString ++= List(qscript.intervals) - filter.scatterCount = 10 - filter.variantVCF = rawCalls - filter.out = filteredCalls - filter.filterName ++= List("HARD_TO_VALIDATE") - filter.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"") - filter.analysisName = baseName + "_VF" - //filter.rodBind :+= RodBind("mask", "Bed", qscript.dindelMask) - //filter.maskName = "InDel" - - var gvc = new GenerateVariantClusters with CommandLineGATKArgs - gvc.rodBind :+= RodBind("hapmap", "VCF", qscript.hapmap) - gvc.rodBind :+= RodBind("1kg", "VCF", qscript.omni) - gvc.rodBind :+= RodBind("input", "VCF", filteredCalls ) - gvc.clusterFile = clusterFile - gvc.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") - gvc.analysisName = baseName + "_GVC" - gvc.intervalsString ++= List(qscript.intervals) - gvc.qual = 100 // clustering parameters to be updated soon pending new experimentation results - gvc.std = 4.5 - gvc.mG = 6 - - var vr = new VariantRecalibrator with CommandLineGATKArgs - vr.rodBind :+= RodBind("1kg", "VCF", qscript.omni) - vr.rodBind :+= RodBind("hapmap", "VCF", qscript.hapmap) - vr.rodBind :+= RodBind("truthOmni", "VCF", qscript.omni) - vr.rodBind :+= RodBind("truthHapMap", "VCF", qscript.hapmap) - vr.rodBind :+= RodBind("input", "VCF", filteredCalls ) - vr.clusterFile = clusterFile - vr.analysisName = baseName + "_VR" - vr.intervalsString ++= List(qscript.intervals) - vr.ignoreFilter 
++= List("HARD_TO_VALIDATE") - vr.target_titv = 2.3 - vr.sm = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY - vr.tranche ++= List("0.1", "1.0", "2.0", "3.0", "5.0", "10.0", "100.0") - vr.out = recalibratedCalls - vr.priorDBSNP = 10.0 - vr.priorHapMap = 12.0 - vr.prior1KG = 12.0 - vr.tranchesFile = tranchesFile - - add(call, filter, gvc, vr) - } - - } -} \ No newline at end of file diff --git a/scala/qscript/oneoffs/rpoplin/Phase1Cleaning.scala b/scala/qscript/oneoffs/rpoplin/Phase1Cleaning.scala deleted file mode 100755 index 9694e44d5..000000000 --- a/scala/qscript/oneoffs/rpoplin/Phase1Cleaning.scala +++ /dev/null @@ -1,97 +0,0 @@ -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils - -class Phase1Cleaning extends QScript { - qscript => - - @Input(doc="path to GATK jar", shortName="gatk", required=true) - var gatkJar: File = _ - - @Input(doc="the chromosome to process", shortName="chr", required=false) - var chr: Int = 20 - - @Input(doc="output path", shortName="outputDir", required=false) - var outputDir: String = "/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams" - - @Input(doc="base output filename", shortName="baseName", required=false) - var baseName: String = "" - - @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false) - var outputTmpDir: String = "/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams" - - private val tmpDir: File = new File("/broad/shptmp/rpoplin/tmp/") - private val 
reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - private val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf" - private val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/AFR.dindel_august_release.20110110.sites.vcf.gz" - private val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/ASN.dindel_august_release.20110110.sites.vcf.gz" - private val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/EUR.dindel_august_release.20110110.sites.vcf.gz" - val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566) - val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","JPT","LWK","MXL","PUR","TSI","YRI") - //val populations = List("ZZZ") // small set used for debugging - - private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference - this.memoryLimit = 2 - this.jobTempDir = qscript.tmpDir - } - - def script = { - callThisChunk() // using scatter/gather capabilities of Queue so no need to for loop over 1Mb chunks of the chromosome - } - - - def callThisChunk() = { - - val interval = "%d".format(qscript.chr) - for( population <- qscript.populations ) { - val baseTmpName: String = qscript.outputTmpDir + "/" + population + ".phase1.chr" + qscript.chr.toString + "." 
- val bamList: File = new File("/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.bam.lists/%s.chr%d.list".format(population, qscript.chr)) - val targetIntervals: File = new File("/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams/intervals/%s.chr%d.intervals".format(population, qscript.chr)) - - // 1.) Create cleaning targets - var target = new RealignerTargetCreator with CommandLineGATKArgs - target.memoryLimit = 4 - target.input_file :+= bamList - target.intervalsString :+= interval - target.out = targetIntervals - target.mismatchFraction = 0.0 - target.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP) - target.rodBind :+= RodBind("indels1", "VCF", qscript.dindelPilotCalls) - target.rodBind :+= RodBind("indels2", "VCF", qscript.dindelAFRCalls) - target.rodBind :+= RodBind("indels3", "VCF", qscript.dindelEURCalls) - target.rodBind :+= RodBind("indels4", "VCF", qscript.dindelASNCalls) - target.jobName = baseName + population + ".target" - - // 2.) 
Clean without SW - var clean = new IndelRealigner with CommandLineGATKArgs - val cleanedBam = new File(baseTmpName + "cleaned.bam") - clean.memoryLimit = 4 - clean.input_file :+= bamList - clean.intervalsString :+= interval - clean.targetIntervals = targetIntervals - clean.out = cleanedBam - clean.doNotUseSW = true - clean.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE - clean.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP) - clean.rodBind :+= RodBind("indels1", "VCF", qscript.dindelPilotCalls) - clean.rodBind :+= RodBind("indels2", "VCF", qscript.dindelAFRCalls) - clean.rodBind :+= RodBind("indels3", "VCF", qscript.dindelEURCalls) - clean.rodBind :+= RodBind("indels4", "VCF", qscript.dindelASNCalls) - clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true - clean.jobName = baseName + population + ".clean" - - add(target, clean) - } - - } -} diff --git a/scala/qscript/oneoffs/rpoplin/Phase1ProjectConsensus.scala b/scala/qscript/oneoffs/rpoplin/Phase1ProjectConsensus.scala deleted file mode 100755 index aa4799850..000000000 --- a/scala/qscript/oneoffs/rpoplin/Phase1ProjectConsensus.scala +++ /dev/null @@ -1,172 +0,0 @@ -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils - -class Phase1ProjectConsensus extends QScript { - qscript => - - @Input(doc="path to GATK jar", shortName="gatk", required=true) - var gatkJar: File = _ - - @Input(doc="output path", shortName="outputDir", required=true) - var outputDir: String = _ - - @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=true) - var outputTmpDir: String = 
_ - - private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf" - val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566,155270560) - val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","IBS","JPT","LWK","MXL","PUR","TSI","YRI") - private val alleles: String = "/humgen/1kg/processing/production_wgs_phase1/consensus/ALL.phase1.wgs.union.pass.sites.vcf" - - private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference - this.memoryLimit = Some(3) - } - - class AnalysisPanel(val baseName: String, val pops: List[String], val jobNumber: Int, val chr: String) { - val rawVCFsnps = new File(qscript.outputDir + "/calls/chr" + chr + "/" + baseName + "/" + baseName + ".phase1.chr" + chr + "." + jobNumber + ".raw.snps.vcf") - - val callSnps = new UnifiedGenotyper with CommandLineGATKArgs - callSnps.out = rawVCFsnps - callSnps.dcov = 50 - callSnps.stand_call_conf = 4.0 - callSnps.stand_emit_conf = 4.0 - callSnps.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE - callSnps.jobName = qscript.outputTmpDir + "/calls/chr" + chr + "/" +baseName + ".phase1.chr" + chr + "." 
+ jobNumber + ".raw.snps" - callSnps.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP - callSnps.genotyping_mode = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - //callSnps.out_mode = org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES - callSnps.rodBind :+= RodBind("alleles", "VCF", qscript.alleles) - callSnps.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - callSnps.sites_only = true - } - - class Chromosome(val inputChr: Int) { - var chr: String = inputChr.toString - if(inputChr == 23) { chr = "X" } - - val combine = new CombineVariants with CommandLineGATKArgs - val chrVCF = new File(qscript.outputDir + "/calls/" + "combined.phase1.chr" + chr + ".raw.snps.vcf") - combine.out = chrVCF - combine.intervalsString :+= chr - } - - def script = { - - for(chr <- List(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)) { - val chrObject = new Chromosome(chr) - val basesPerJob: Int = 3000000 - val lastBase: Int = qscript.chromosomeLength(chr - 1) - var start: Int = 1 - var stop: Int = start - 1 + basesPerJob - if( stop > lastBase ) { stop = lastBase } - var jobNumber: Int = 1 - while( jobNumber < (lastBase.toFloat / basesPerJob.toFloat) + 1.0) { - if( chr != 23 ) { - callThisChunk("%d:%d-%d".format(chr, start, stop), jobNumber, chr, chrObject) - } else { - callThisChunk("X:%d-%d".format(start, stop), jobNumber, chr, chrObject) - } - start += basesPerJob - stop += basesPerJob - if( stop > lastBase ) { stop = lastBase } - jobNumber += 1 - } - add(chrObject.combine) - } - } - - - def callThisChunk(interval: String, jobNumber: Int, inputChr: Int, chrObject: Chromosome) = { - - var chr: String = inputChr.toString - if(inputChr == 23) { chr = "X" } - - val AFRadmix = new AnalysisPanel("AFR.admix", List("LWK","YRI","ASW","CLM","PUR"), jobNumber, chr) - val AMRadmix = new 
AnalysisPanel("AMR.admix", List("MXL","CLM","PUR","ASW"), jobNumber, chr) - val EURadmix = new AnalysisPanel("EUR.admix", List("CEU","FIN","GBR","TSI","IBS","MXL","CLM","PUR","ASW"), jobNumber, chr) - val ASNadmix = new AnalysisPanel("ASN.admix", List("CHB","CHS","JPT","MXL","CLM","PUR"), jobNumber, chr) - val AFR = new AnalysisPanel("AFR", List("LWK","YRI","ASW"), jobNumber, chr) - val AMR = new AnalysisPanel("AMR", List("MXL","CLM","PUR"), jobNumber, chr) - val EUR = new AnalysisPanel("EUR", List("CEU","FIN","GBR","TSI","IBS"), jobNumber, chr) - val ASN = new AnalysisPanel("ASN", List("CHB","CHS","JPT"), jobNumber, chr) - val ALL = new AnalysisPanel("ALL", List("LWK","YRI","ASW","MXL","CLM","PUR","CEU","FIN","GBR","TSI","IBS","CHB","CHS","JPT"), jobNumber, chr) - - val analysisPanels = List(AFR, ASN, AMR, EUR, AFRadmix, ASNadmix, AMRadmix, EURadmix) //ALL - - val combine = new CombineVariants with CommandLineGATKArgs - val combinedChunk = new File(qscript.outputDir + "/calls/chr" + chr + "/" + "combined.phase1.chr" + chr + "." + jobNumber + ".raw.snps.vcf") - - combine.out = combinedChunk - combine.jobName = qscript.outputTmpDir + "/calls/chr" + chr + "/" + "combined.phase1.chr" + chr + "." + jobNumber + ".raw.snps" - combine.intervalsString :+= interval - combine.mergeInfoWithMaxAC = true - combine.priority = "AFR.admix,AMR.admix,EUR.admix,ASN.admix,AFR,AMR,EUR,ASN" //ALL, - - for( population <- qscript.populations ) { - val baseTmpName: String = qscript.outputTmpDir + "/calls/chr" + chr + "/" + population + ".phase1.chr" + chr + "." + jobNumber.toString + "." - val bamList: File = new File("/humgen/1kg/processing/production_wgs_phase1/bam_lists/%s.list".format(population)) - val targetIntervals: File = new File(baseTmpName + "target.intervals") - - // 1.) 
Create cleaning targets - val target = new RealignerTargetCreator with CommandLineGATKArgs - target.memoryLimit = 4 - target.input_file :+= bamList - target.intervalsString :+= interval - target.out = targetIntervals - target.mismatchFraction = 0.0 - target.maxIntervalSize = 700 - target.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls) - target.jobName = baseTmpName + "target" - //target.isIntermediate = true - target.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - - // 2.) Clean without SW - val clean = new IndelRealigner with CommandLineGATKArgs - val cleanedBam = new File(baseTmpName + "cleaned.bam") - clean.memoryLimit = 6 - clean.input_file :+= bamList - clean.intervalsString :+= interval - clean.targetIntervals = targetIntervals - clean.out = cleanedBam - clean.doNotUseSW = true - clean.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF - clean.simplifyBAM = true - clean.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls) - clean.jobName = baseTmpName + "clean" - //clean.isIntermediate = true - clean.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - - add(target, clean) - - for( a <- analysisPanels ) { - for( p <- a.pops) { - if( p == population ) { - a.callSnps.input_file :+= cleanedBam - } - } - } - } - - for( a <- analysisPanels ) { - a.callSnps.intervalsString :+= interval - if(a.baseName == "ALL") { a.callSnps.memoryLimit = 4 } - add(a.callSnps) - combine.rodBind :+= RodBind(a.baseName, "VCF", a.callSnps.out) - } - - add(combine) - chrObject.combine.rodBind :+= RodBind("ALL" + jobNumber.toString, "VCF", combine.out) - } -} diff --git a/scala/qscript/oneoffs/rpoplin/VQSR_parameterSearch.scala b/scala/qscript/oneoffs/rpoplin/VQSR_parameterSearch.scala deleted file mode 100755 index 91e00e8f6..000000000 --- a/scala/qscript/oneoffs/rpoplin/VQSR_parameterSearch.scala +++ /dev/null @@ -1,362 +0,0 @@ -import org.broadinstitute.sting.queue.extensions.gatk._ -import 
org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.queue.QScript -import org.apache.commons.io.FilenameUtils; - -class VQSR_parameterSearch extends QScript { - qscript => - - @Argument(shortName="gatk", doc="gatk jar file", required=true) - var gatkJarFile: File = _ - - @Argument(shortName="experiment", doc="experiment number", required=true) - var experiment: String = "0000" - - @Argument(shortName="outputDir", doc="output directory", required=true) - var outputDir: String = "./" - - @Argument(shortName="skipCalling", doc="If true, skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false) - var skipCalling: Boolean = false - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; memoryLimit = 2; } - - class Target(val baseName: String, val reference: File, val rodName: String, val bamList: File, val goldStandard_VCF: File, val intervals: String, val titvTarget: Double, val isLowpass: Boolean) { - def name = qscript.outputDir + baseName - def clusterFile = new File(name + ".clusters") - def rawVCF = new File(name + ".raw.vcf") - def filteredVCF = new File(name + ".filtered.vcf") - def goldStandardName = qscript.outputDir + "goldStandard/" + baseName - var goldStandardClusterFile: File = new File("") - var gaussian: Int = 1 - var shrinkage: Double = 1.0 - var dirichlet: Double = 1.0 - var backoff: Double = 1.0 - var qualCutoff: Int = 1 - var std: Double = 1.0 - var useQD: Int = 1 - var useSB: Int = 1 - var useHS: Int = 1 - var useHRUN: Int = 1 - var useMQRST: Int = 1 - var useBQRST: Int = 1 - var useGC: Int = 1 - var useMQ: Int = 1 - var useSumGL: Int = 1 - var trainOmni: Int = 1 - } - - val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") - val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta") - val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - 
// ToDos: - // reduce the scope of the datasets so the script is more nimble - // figure out how to give names to all the Queue-LSF logs (other than Q-1931@node1434-24.out) so that it is easier to find logs for certain steps - // create gold standard BAQ'd bam files, no reason to always do it on the fly - - // Analysis to add at the end of the script: - // auto generation of the cluster plots - // spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls - // produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF - - // Define the target datasets here - def lowPass = true - val HiSeq = new Target("NA12878.HiSeq", hg18, "hg18", // BUGBUG: cut down to chr1 - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), - new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.07, !lowPass) - val WEx = new Target("NA12878.WEx", hg18, "hg18", - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), - new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, !lowPass) - val LowPassN60 = new Target("lowpass.N60", b36, "b36", // which reference the data is aligned to - new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from - new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR - 
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, lowPass) // chunked interval list to use with Queue's scatter/gather functionality - val LowPassAugust = new Target("ALL.august.v4", b37, "b37", // BUGBUG: kill this, it is too large - new File("/humgen/1kg/processing/allPopulations_chr20_august_release.cleaned.merged.bams/ALL.cleaned.merged.list"), - new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass) - val LowPassEUR363Nov = new Target("EUR.nov2010", b37, "b37", - new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"), - new File("/humgen/gsa-hpprojects/dev/rpoplin/haplotypeScore/sting_dev_oldQD_hs10/logs/EUR.nov.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass) - val LowPassFIN79Nov = new Target("FIN.nov2010", b37, "b37", - new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), - new File("/broad/shptmp/rpoplin/pipeline_newHS7/FIN.nov2010.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass) - val TGPWExGdA = new Target("1000G.WEx.GdA", b37, "b37", - new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people - new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass) - - //val targets = 
List(HiSeq, WEx, LowPassN60, LowPassAugust, LowPassEUR363Nov, LowPassFIN79Nov, TGPWExGdA) - val targets = List(LowPassEUR363Nov) - def script = { - def goldStandard = true - - var gaussianList = List(6) - var shrinkageList = List(0.0001) - var dirichletList = List(1000.0) - var backoffList = List(1.3) - var qualCutoffList = List(100) - var stdList = List(4.5) - var useQDList = List(1) - var useSBList = List(1) - var useHSList = List(1) - var useHRUNList = List(1) - var useMQRSTList = List(0) - var useBQRSTList = List(0) - var useGCList = List(0) - var useMQList = List(0) - var useSumGLList = List(0) - var trainOmniList = List(1) - - if(experiment == "0000") { - gaussianList = List(6,16) - trainOmniList = List(0,1) - useMQRSTList = List(0,1) - } - if(experiment == "0001") { - gaussianList = List(6, 16) - shrinkageList = List(0.0001, 0.01) - dirichletList = List(0.001, 1000.0) - backoffList = List(0.7, 1.0, 1.3) - useQDList = List(0,1) - useSBList = List(0,1) - useHSList = List(0,1) - useHRUNList = List(0,1) - useMQRSTList = List(0,1) - useBQRSTList = List(0,1) - useSumGLList = List(0,1) - trainOmniList = List(0,1) - } - if(experiment == "0002") { - gaussianList = List(2, 10, 50) - stdList = List(2.0, 4.5, 8.5) - dirichletList = List(0.0001, 0.01) - backoffList = List(0.5, 0.6, 0.9) - useQDList = List(1) - useSBList = List(0,1) - useHSList = List(0,1) - useHRUNList = List(0) - useMQRSTList = List(0,1) - useBQRSTList = List(0) - useSumGLList = List(0,1) - useGCList = List(0,1) - useMQList = List(0,1) - trainOmniList = List(0,1) - } - if(experiment == "0003") { - qualCutoffList = List(5, 40, 100, 400) - shrinkageList = List(0.0001, 0.001, 0.1) - dirichletList = List(0.0001, 0.001, 0.01) - useQDList = List(1) - useSBList = List(0,1) - useHSList = List(1) - useHRUNList = List(0) - useMQRSTList = List(0,1) - useBQRSTList = List(0,1) - useGCList = List(0,1) - useMQList = List(0,1) - useSumGLList = List(0,1) - trainOmniList = List(0,1) - } - if(experiment == "0004") { - 
gaussianList = List(5, 25) - shrinkageList = List(0.01, 1.0, 100.0) - dirichletList = List(0.001, 10.0, 1000.0) - backoffList = List(0.6, 1.0, 1.4) - useQDList = List(1) - useSBList = List(1) - useHSList = List(0,1) - useHRUNList = List(0,1) - useMQRSTList = List(0,1) - useBQRSTList = List(0,1) - useGCList = List(0,1) - useMQList = List(0,1) - } - if(experiment == "0005") { - gaussianList = List(4,50,100) - shrinkageList = List(0.0001, 10.0) - dirichletList = List(0.0001, 0.001) - backoffList = List(0.2, 0.3, 0.6) - stdList = List(0.5, 1.0, 10.0) - useQDList = List(1) - useSBList = List(1) - useHSList = List(1) - useHRUNList = List(0,1) - useMQRSTList = List(0,1) - useBQRSTList = List(0,1) - useGCList = List(0,1) - useMQList = List(0) - trainOmniList = List(0,1) - } - - - - for (target <- targets) { - - - - for(gaussian: Int <- gaussianList) { - for(shrinkage: Double <- shrinkageList) { - for(dirichlet: Double <- dirichletList) { - for(backoff: Double <- backoffList) { - for(qualCutoff: Int <- qualCutoffList) { - for(std: Double <- stdList) { - for(useQD: Int <- useQDList ) { - for(useSB: Int <- useSBList ) { - for(useHS: Int <- useHSList ) { - for(useHRUN: Int <- useHRUNList ) { - for(useMQRST: Int <- useMQRSTList ) { - for(useBQRST: Int <- useBQRSTList ) { - for(useGC: Int <- useGCList ) { - for(useMQ: Int <- useMQList ) { - for(useSumGL: Int <- useSumGLList ) { - for(trainOmni: Int <- trainOmniList) { - - target.gaussian = gaussian - target.shrinkage = shrinkage - target.dirichlet = dirichlet - target.backoff = backoff - target.qualCutoff = qualCutoff - target.std = std - target.useQD = useQD - target.useSB = useSB - target.useHS = useHS - target.useHRUN = useHRUN - target.useMQRST = useMQRST - target.useBQRST = useBQRST - target.useGC = useGC - target.useMQ = useMQ - target.useSumGL = useSumGL - target.trainOmni = trainOmni - val clustersName: String = "%s_%d_%.4f_%.4f_%.1f_%d_%.1f_%d%d%d%d%d%d%d%d%d_%d.clusters".format(target.name, target.gaussian, 
target.shrinkage, target.dirichlet, target.backoff, target.qualCutoff, target.std, target.useQD, target.useSB, target.useHS, target.useHRUN, target.useMQRST, target.useBQRST, target.useGC, target.useMQ, target.useSumGL, target.trainOmni) - target.goldStandardClusterFile = new File(clustersName) - add(new GenerateVariantClusters(target, goldStandard)) - add(new VariantRecalibratorTiTv(target, goldStandard)) - add(new VariantRecalibratorNRS(target, goldStandard)) - } - } - } - } - } - } - } - } - } - } - } - } - } - } - } - } - } - } - - def bai(bam: File) = new File(bam + ".bai") - - val FiltersToIgnore = List("DPFilter", "ABFilter", "ESPStandard", "QualByDepth", "StrandBias", "HomopolymerRun") - - // 3.) VQSR part1 Generate Gaussian clusters based on truth sites - class GenerateVariantClusters(t: Target, goldStandard: Boolean) extends org.broadinstitute.sting.queue.extensions.gatk.GenerateVariantClusters with UNIVERSAL_GATK_ARGS { - val name: String = if ( goldStandard ) { t.goldStandardName } else { t.name } - this.reference_sequence = t.reference - this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_" + t.rodName + ".rod") - this.rodBind :+= RodBind("hapmap", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf") - if(t.trainOmni == 0) { - this.rodBind :+= RodBind("1kg", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg_pilot1_projectCalls/ALL.low_coverage.2010_07.hg19.vcf") - this.rodBind :+= RodBind("truth", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." 
+ t.rodName + "_fwd.vcf") - } else { - this.rodBind :+= RodBind("1kg", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/764samples.deduped.b37.annot.vcf") - this.rodBind :+= RodBind("truth", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/764samples.deduped.b37.annot.vcf") - } - this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.filteredVCF } ) - this.clusterFile = if ( goldStandard ) { t.goldStandardClusterFile } else { t.clusterFile } - //this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") - if(t.useQD == 1) { - this.use_annotation ++= List("QD") - } - if(t.useSB == 1) { - this.use_annotation ++= List("SB") - } - if(t.useHS == 1) { - this.use_annotation ++= List("HaplotypeScore1") - } - if(t.useHRUN == 1) { - this.use_annotation ++= List("HRun") - } - if(t.useMQRST == 1) { - this.use_annotation ++= List("MQRankSum") - } - if(t.useBQRST == 1) { - this.use_annotation ++= List("BaseQRankSum") - } - if(t.useGC == 1) { - this.use_annotation ++= List("GC") - } - if(t.useMQ == 1) { - this.use_annotation ++= List("MQ") - } - if(t.useSumGL == 1) { - this.use_annotation ++= List("sumGLbyD+") - } - if( t.useQD==0 && t.useSB==0 && t.useHS==0 && t.useHRUN==0 && t.useMQRST==0 && t.useBQRST==0 && t.useGC==0 && t.useMQ==0 && t.useSumGL==0) { - this.use_annotation ++= List("MQ","QD","DP") - } - this.analysisName = name + "_GVC" - this.intervalsString ++= List(t.intervals) - this.qual = t.qualCutoff - this.std = t.std - this.mG = t.gaussian - this.ignoreFilter ++= FiltersToIgnore - this.dirichlet = t.dirichlet - this.shrinkage = t.shrinkage - } - - // 4.) 
VQSR part2 Calculate new LOD for all input SNPs by evaluating the Gaussian clusters - class VariantRecalibratorBase(t: Target, goldStandard: Boolean) extends org.broadinstitute.sting.queue.extensions.gatk.VariantRecalibrator with UNIVERSAL_GATK_ARGS { - val name: String = if ( goldStandard ) { t.goldStandardName } else { t.name } - this.reference_sequence = t.reference - this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_" + t.rodName + ".rod") - this.rodBind :+= RodBind("hapmap", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf") - if(t.trainOmni == 0) { - this.rodBind :+= RodBind("1kg", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg_pilot1_projectCalls/ALL.low_coverage.2010_07.hg19.vcf") - this.rodBind :+= RodBind("truth", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf") - } else { - this.rodBind :+= RodBind("1kg", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/764samples.deduped.b37.annot.vcf") - this.rodBind :+= RodBind("truth", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/764samples.deduped.b37.annot.vcf") - } - this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.filteredVCF } ) - this.clusterFile = if ( goldStandard ) { t.goldStandardClusterFile } else { t.clusterFile } - this.analysisName = name + "_VR" - this.intervalsString ++= List(t.intervals) - this.ignoreFilter ++= FiltersToIgnore - this.ignoreFilter ++= List("HARD_TO_VALIDATE") - this.target_titv = t.titvTarget - this.backOff = t.backoff - } - - // 4a.) 
Choose VQSR tranches based on novel ti/tv - class VariantRecalibratorTiTv(t: Target, goldStandard: Boolean) extends VariantRecalibratorBase(t, goldStandard) { - this.tranche ++= List("1.0") - this.out = new File("/dev/null") - val tranchesName: String = "%s_%d_%.4f_%.4f_%.1f_%d_%.1f_%d%d%d%d%d%d%d%d%d_%d.titv.tranches".format(this.name, t.gaussian, t.shrinkage, t.dirichlet, t.backoff, t.qualCutoff, t.std, t.useQD, t.useSB, t.useHS, t.useHRUN, t.useMQRST, t.useBQRST, t.useGC, t.useMQ, t.useSumGL, t.trainOmni) - this.tranchesFile = new File(tranchesName) - } - - // 4b.) Choose VQSR tranches based on sensitivity to truth set - class VariantRecalibratorNRS(t: Target, goldStandard: Boolean) extends VariantRecalibratorBase(t, goldStandard) { - this.sm = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY - if(t.trainOmni == 0 ) { - this.tranche ++= List("1.0") - } else { - this.tranche ++= List("2.5") - } - this.out = new File("/dev/null") - val tranchesName: String = "%s_%d_%.4f_%.4f_%.1f_%d_%.1f_%d%d%d%d%d%d%d%d%d_%d.ts.tranches".format(this.name, t.gaussian, t.shrinkage, t.dirichlet, t.backoff, t.qualCutoff, t.std, t.useQD, t.useSB, t.useHS, t.useHRUN, t.useMQRST, t.useBQRST, t.useGC, t.useMQ, t.useSumGL, t.trainOmni) - this.tranchesFile = new File(tranchesName) - } -} diff --git a/scala/qscript/playground/HybridSelectionPipeline.scala b/scala/qscript/playground/HybridSelectionPipeline.scala deleted file mode 100644 index 533c2f8d3..000000000 --- a/scala/qscript/playground/HybridSelectionPipeline.scala +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell 
- * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.function.ListWriterFunction -import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals -import org.broadinstitute.sting.queue.QScript -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.broad.PicardPipeline - -class HybridSelectionPipeline extends QScript { - qscript => - - @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y") - var yamlFile: File = _ - - @Input(doc="level of parallelism for UnifiedGenotyper. By default set to 20.", shortName="varScatter", required=false) - var variantCallerScatterCount = 20 - - @Argument(doc="memory limit for UnifiedGenotyper. By default set to 2g.", shortName="varMemory", required=false) - var variantCallerMemory = 2 - - @Argument(doc="expand each target in input intervals by the specified number of bases. By default set to 50 bases.", shortName="expand", required=false) - var expandIntervals = 50 - - @Argument(doc="pipeline memory limit. 
By default set to 2g.", shortName="pipeMemory", required=false) - var pipelineMemoryLimit = 2 - - private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.reference_sequence = qscript.pipeline.getProject.getReferenceFile - this.intervals = List(qscript.pipeline.getProject.getIntervalList) - this.memoryLimit = pipelineMemoryLimit - } - - def script() { - pipeline = PicardPipeline.parse(qscript.yamlFile) - - val projectBase = qscript.pipeline.getProject.getName - val bamType = "cleaned" - - val writeBamList = new ListWriterFunction - writeBamList.inputFiles = qscript.pipeline.getSamples.filter(_.getBamFiles.contains(bamType)).map(_.getBamFiles.get(bamType)).toList - writeBamList.listFile = projectBase +".bam.list" - writeBamList.jobOutputFile = writeBamList.listFile + ".out" - add(writeBamList) - - val flankIntervals = projectBase + ".flanks.intervals" - - if (qscript.expandIntervals > 0) { - val ei = new ExpandIntervals( - qscript.pipeline.getProject.getIntervalList, - 1, - qscript.expandIntervals, - flankIntervals, - qscript.pipeline.getProject.getReferenceFile, - "INTERVALS", - "INTERVALS") - ei.jobOutputFile = ei.outList + ".out" - - add(ei) - } - - trait ExpandedIntervals extends CommandLineGATK { - if (qscript.expandIntervals > 0) - this.intervals :+= flankIntervals - } - - val call = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals - call.input_file = List(writeBamList.listFile) - call.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp) - call.downsample_to_coverage = 600 - call.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH - call.GSA_PRODUCTION_ONLY = true - call.out = projectBase + ".unfiltered.vcf" - call.jobOutputFile = call.out + ".out" - call.scatterCount = qscript.variantCallerScatterCount - call.memoryLimit = qscript.variantCallerMemory - 
add(call) - - val selectSNPs = new SelectVariants with CommandLineGATKArgs with ExpandedIntervals - selectSNPs.selectSNPs = true - selectSNPs.rodBind :+= RodBind("variant", "VCF", call.out) - selectSNPs.out = projectBase + ".snps.unfiltered.vcf" - selectSNPs.jobOutputFile = selectSNPs.out + ".out" - add(selectSNPs) - - val selectIndels = new SelectVariants with CommandLineGATKArgs with ExpandedIntervals - selectIndels.selectIndels = true - selectIndels.rodBind :+= RodBind("variant", "VCF", call.out) - selectIndels.out = projectBase + ".indels.unfiltered.vcf" - selectIndels.jobOutputFile = selectIndels.out + ".out" - add(selectIndels) - - val filterSNPs = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals - filterSNPs.variantVCF = selectSNPs.out - filterSNPs.filterName = List("SNP_SB", "SNP_QD", "SNP_HRun") - filterSNPs.filterExpression = List("\"SB>=0.10\"", "\"QD<5.0\"", "\"HRun>=4\"") - filterSNPs.clusterWindowSize = 10 - filterSNPs.clusterSize = 3 - filterSNPs.out = projectBase + ".snps.filtered.vcf" - filterSNPs.jobOutputFile = filterSNPs.out + ".out" - add(filterSNPs) - - val filterIndels = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals - filterIndels.variantVCF = selectIndels.out - filterIndels.filterName = List("Indel_QUAL", "Indel_SB", "Indel_QD") - filterIndels.filterExpression = List("\"QUAL<30.0\"", "\"SB>-1.0\"", "\"QD<2.0\"") - filterIndels.out = projectBase + ".indels.filtered.vcf" - filterIndels.jobOutputFile = filterIndels.out + ".out" - add(filterIndels) - - val combineSNPsIndels = new CombineVariants with CommandLineGATKArgs with ExpandedIntervals - combineSNPsIndels.rodBind :+= RodBind("indels", "VCF", filterIndels.out) - combineSNPsIndels.rodBind :+= RodBind("snps", "VCF", filterSNPs.out) - combineSNPsIndels.rod_priority_list = "indels,snps" - combineSNPsIndels.filteredrecordsmergetype = org.broadinstitute.sting.utils.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED - 
combineSNPsIndels.assumeIdenticalSamples = true - combineSNPsIndels.out = projectBase + ".unannotated.vcf" - combineSNPsIndels.jobOutputFile = combineSNPsIndels.out + ".out" - add(combineSNPsIndels) - - val annotate = new GenomicAnnotator with CommandLineGATKArgs with ExpandedIntervals - annotate.rodBind :+= RodBind("variant", "VCF", combineSNPsIndels.out) - annotate.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.pipeline.getProject.getRefseqTable) - annotate.rodToIntervalTrackName = "variant" - annotate.out = projectBase + ".vcf" - annotate.jobOutputFile = annotate.out + ".out" - add(annotate) - - val targetEval = new VariantEval with CommandLineGATKArgs - targetEval.rodBind :+= RodBind("eval", "VCF", annotate.out) - targetEval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp) - targetEval.doNotUseAllStandardStratifications = true - targetEval.doNotUseAllStandardModules = true - targetEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants") - targetEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "FunctionalClass", "Sample") - targetEval.out = projectBase + ".eval" - targetEval.jobOutputFile = targetEval.out + ".out" - add(targetEval) - - if (qscript.expandIntervals > 0) { - val flanksEval = new VariantEval with CommandLineGATKArgs - flanksEval.rodBind :+= RodBind("eval", "VCF", annotate.out) - flanksEval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp) - flanksEval.intervals = List(flankIntervals) - flanksEval.doNotUseAllStandardStratifications = true - flanksEval.doNotUseAllStandardModules = true - flanksEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants") - flanksEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "FunctionalClass", "Sample") - flanksEval.out = projectBase + ".flanks.eval" - 
flanksEval.jobOutputFile = flanksEval.out + ".out" - add(flanksEval) - } - } -} diff --git a/scala/qscript/playground/WholeGenomePipeline.scala b/scala/qscript/playground/WholeGenomePipeline.scala deleted file mode 100644 index 088528ac0..000000000 --- a/scala/qscript/playground/WholeGenomePipeline.scala +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -import io.Source -import org.broadinstitute.sting.queue.extensions.samtools.{SamtoolsIndexFunction, SamtoolsMergeFunction} -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.utils.interval.IntervalUtils - -class WholeGenomePipeline extends QScript { - @Input(doc="Bam file list", shortName = "I", required=true) - var bamList: File = _ - - @Input(doc="Exclude intervals list", shortName = "XL", required=false) - var excludeIntervals: List[File] = Nil - - @Argument(doc="path to tmp space for storing intermediate bam files", shortName="cleanerTmpDir", required=true) - var cleanerTmpDir: String = _ - - @Argument(doc="Flag for running the whole genome (wg) or chromosome 20 (chr20). The default is chr20.", shortName="runType", required=false) - var runType = "chr20" - - @Argument(doc="Chunk size. Defaults to 3,000,000", shortName="chunk", required=false) - var chunkSize = 3000000 - - @Argument(doc="Memory limit. 
Defaults to 4g", shortName="pipeMemory", required=false) - var pipelineMemoryLimit = 4 - - val resources = "/humgen/gsa-pipeline/resources/5777/b37/" - val reference = resources + "human_g1k_v37.fasta" - val dbsnp = resources + "dbsnp_132.b37.vcf" - val indels = resources + "1000G_indels_for_realignment.b37.vcf" - val omni = resources + "1000G_omni2.5.b37.sites.vcf" - val hapmap = resources + "hapmap_3.3.b37.sites.vcf" - - trait CommandLineGATKArgs extends CommandLineGATK { - this.reference_sequence = reference - this.intervalsString = runIntervals - this.memoryLimit = pipelineMemoryLimit - } - - case class Interval(chr: String, start: Long, stop: Long) { - override def toString = "%s:%d-%d".format(chr, start, stop) - } - - var runIntervals = List.empty[String] - - def script() { - - var intervals = Traversable.empty[Interval] - - runType = runType.toLowerCase - if (runType == "wg") { - val contigs = (1 to 22).map(_.toString) ++ List("X", "Y", "MT") - val sizes = IntervalUtils.getContigSizes(reference) - intervals = contigs.map(chr => new Interval(chr, 1, sizes.get(chr).longValue)) - runIntervals = Nil - } else { - val locs = Map( - "cent1" -> new Interval("1", 121429168, 121529168), - "cent16" -> new Interval("16", 40844464, 40944464), - "chr20" -> new Interval("20", 1, 63025520), - "chr20_100k" -> new Interval("20", 100001, 200000)) - - locs.get(runType) match { - case Some(range) => - intervals = List(range) - runIntervals = List(range.toString) - case None => - throw new RuntimeException("Invalid runType: " + runType + ". Must be one of: " + locs.keys.mkString(", ") + ", or wg") - } - } - - val project = Array(".bams.list", ".bam.list", ".list").foldLeft(bamList.getName)(_.stripSuffix(_)) - val projectBase = project + "." 
+ runType - - val mergeBam = new SamtoolsMergeFunction - mergeBam.inputBams = Source.fromFile(bamList).getLines().toList - if (runType != "wg") - mergeBam.region = intervals.head.toString - mergeBam.memoryLimit = pipelineMemoryLimit - mergeBam.outputBam = cleanerTmpDir + "/" + projectBase + ".unclean.bam" - mergeBam.jobOutputFile = projectBase + ".unclean.bam.out" - mergeBam.isIntermediate = true - mergeBam.memoryLimit = pipelineMemoryLimit - add(mergeBam) - - val indexBam = new SamtoolsIndexFunction - indexBam.bamFile = mergeBam.outputBam - indexBam.memoryLimit = pipelineMemoryLimit - indexBam.jobOutputFile = projectBase + ".unclean.bam.bai.out" - indexBam.isIntermediate = true - add(indexBam) - - var chunkVcfs = List.empty[File] - for (interval <- intervals) { - val chr = interval.chr - val chrStart = interval.start - val chrStop = interval.stop - - var start = chrStart - var chunkNumber = 1 - - while (start <= chrStop) { - val stop = (start + chunkSize - 1) min chrStop - - val chunkBase: String = "chunks/" + project + "." 
+ runType + "_chunk_" + chr + "_" + chunkNumber - val tmpBase: String = cleanerTmpDir + "/" + chunkBase - - val chunkInterval = List("%s:%d-%d".format(chr, start, stop)) - - val target = new RealignerTargetCreator with CommandLineGATKArgs - target.input_file :+= mergeBam.outputBam - target.intervalsString = chunkInterval - target.excludeIntervals = excludeIntervals - target.mismatchFraction = 0.0 - target.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) - target.rodBind :+= RodBind("indels", "VCF", indels) - target.out = tmpBase + ".target.intervals" - target.jobOutputFile = chunkBase + ".target.intervals.out" - target.isIntermediate = true - add(target) - - val clean = new IndelRealigner with CommandLineGATKArgs - clean.input_file :+= mergeBam.outputBam - clean.intervalsString = chunkInterval - clean.excludeIntervals = excludeIntervals - clean.targetIntervals = target.out - clean.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) - clean.rodBind :+= RodBind("indels", "VCF", indels) - clean.consensusDeterminationModel = org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel.USE_READS - clean.simplifyBAM = true - clean.bam_compression = 1 - clean.out = tmpBase + ".cleaned.bam" - clean.jobOutputFile = chunkBase + ".cleaned.bam.out" - clean.isIntermediate = true - add(clean) - - val call = new UnifiedGenotyper with CommandLineGATKArgs - call.input_file :+= clean.out - call.intervalsString = chunkInterval - call.excludeIntervals = excludeIntervals - call.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) - call.downsample_to_coverage = 50 - call.standard_min_confidence_threshold_for_calling = 4.0 - call.standard_min_confidence_threshold_for_emitting = 4.0 - call.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY - call.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH - call.GSA_PRODUCTION_ONLY = true - call.out = chunkBase + ".vcf" - call.jobOutputFile 
= call.out + ".out" - add(call) - - chunkVcfs :+= call.out - start += chunkSize - chunkNumber += 1 - } - } - - val combineChunks = new CombineVariants with CommandLineGATKArgs - combineChunks.rodBind = chunkVcfs.zipWithIndex.map { case (vcf, index) => RodBind("input"+index, "VCF", vcf) } - combineChunks.rod_priority_list = chunkVcfs.zipWithIndex.map { case (vcf, index) => "input"+index }.mkString(",") - combineChunks.filteredrecordsmergetype = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED - combineChunks.assumeIdenticalSamples = true - combineChunks.out = projectBase + ".unfiltered.vcf" - combineChunks.jobOutputFile = combineChunks.out + ".out" - add(combineChunks) - - val selectSNPs = new SelectVariants with CommandLineGATKArgs - selectSNPs.selectSNPs = true - selectSNPs.rodBind :+= RodBind("variant", "VCF", combineChunks.out) - selectSNPs.out = projectBase + ".snps.unrecalibrated.vcf" - selectSNPs.jobOutputFile = selectSNPs.out + ".out" - add(selectSNPs) - - val selectIndels = new SelectVariants with CommandLineGATKArgs - selectIndels.selectIndels = true - selectIndels.rodBind :+= RodBind("variant", "VCF", combineChunks.out) - selectIndels.out = projectBase + ".indels.unfiltered.vcf" - selectIndels.jobOutputFile = selectIndels.out + ".out" - add(selectIndels) - - val filterIndels = new VariantFiltration with CommandLineGATKArgs - filterIndels.variantVCF = selectIndels.out - filterIndels.filterName = List("Indel_QUAL", "Indel_SB", "Indel_QD", "Indel_HRun", "Indel_HaplotypeScore") - filterIndels.filterExpression = List("\"QUAL<30.0\"", "\"SB>-1.0\"", "\"QD<2.0\"", "\"HRun>15\"", "\"HaplotypeScore>20.0\"") - filterIndels.out = projectBase + ".indels.filtered.vcf" - filterIndels.jobOutputFile = filterIndels.out + ".out" - add(filterIndels) - - val combineSNPsIndels = new CombineVariants with CommandLineGATKArgs - combineSNPsIndels.rodBind :+= RodBind("indels", "VCF", selectIndels.out) - 
combineSNPsIndels.rodBind :+= RodBind("snps", "VCF", selectSNPs.out) - combineSNPsIndels.rod_priority_list = "indels,snps" - combineSNPsIndels.filteredRecordsMergeType = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED - combineSNPsIndels.assumeIdenticalSamples = true - combineSNPsIndels.out = projectBase + ".unrecalibrated.vcf" - combineSNPsIndels.jobOutputFile = combineSNPsIndels.out + ".out" - add(combineSNPsIndels) - - val vr = new VariantRecalibrator with CommandLineGATKArgs - vr.rodBind :+= RodBind("input", "VCF", combineSNPsIndels.out) - vr.rodBind :+= RodBind("hapmap", "VCF", hapmap, "known=false,training=true,truth=true,prior=15.0") - vr.rodBind :+= RodBind("omni", "VCF", omni, "known=false,training=true,truth=false,prior=12.0") - vr.rodBind :+= RodBind("dbsnp", "VCF", dbsnp, "known=true,training=false,truth=false,prior=8.0") - vr.trustAllPolymorphic = true - vr.use_annotation = List("QD", "HaplotypeScore", "HRun", "MQRankSum", "ReadPosRankSum") - vr.TStranche = List( - "100.0", "99.9", "99.5", "99.3", - "99.0", "98.9", "98.8", - "98.5", "98.4", "98.3", "98.2", "98.1", - "98.0", - "97.0", - "95.0", - "90.0") - vr.tranches_file = projectBase + ".tranches" - vr.recal_file = projectBase + ".recal" - vr.jobOutputFile = vr.recal_file + ".out" - vr.memoryLimit = 32 - add(vr) - - for (tranche <- vr.TStranche) { - val ar = new ApplyRecalibration with CommandLineGATKArgs - ar.rodBind :+= RodBind("input", "VCF", combineSNPsIndels.out) - ar.tranches_file = vr.tranches_file - ar.recal_file = vr.recal_file - ar.ts_filter_level = tranche.toDouble - ar.out = projectBase + ".recalibrated." 
+ tranche + ".vcf" - ar.jobOutputFile = ar.out + ".out" - ar.memoryLimit = 32 - add(ar) - - val eval = new VariantEval with CommandLineGATKArgs - eval.tranchesFile = vr.tranches_file - eval.rodBind :+= RodBind("eval", "VCF", ar.out) - eval.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) - eval.doNotUseAllStandardStratifications = true - eval.doNotUseAllStandardModules = true - eval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants") - eval.stratificationModule = List("EvalRod", "CompRod", "Novelty") - eval.out = swapExt(ar.out, ".vcf", ".eval") - eval.jobOutputFile = eval.out + ".out" - eval.memoryLimit = 32 - add(eval) - } - } -} diff --git a/scala/qscript/playground/recalibrate.scala b/scala/qscript/playground/recalibrate.scala deleted file mode 100755 index f57991fee..000000000 --- a/scala/qscript/playground/recalibrate.scala +++ /dev/null @@ -1,90 +0,0 @@ -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.queue.QScript -import org.apache.commons.io.FilenameUtils; - -class recalibrate extends QScript { - @Input(doc="bamIn", shortName="I") - var bamIns: List[File] = Nil - - @Argument(doc="scatter") - var scatter = false - - @Argument(doc="gatk jar file") - var gatkJarFile: File = _ - - @Argument(doc="Assume initial count covariates has completed", required=false) - var skipInitialCountCovariates: Boolean = false - - @Argument(doc="", required=false) - var skipUQUpdateArg: Boolean = false - - @Argument(shortName = "R", doc="ref") - var referenceFile: File = _ - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; reference_sequence = referenceFile; } - -def script = { - for (bamIn <- bamIns) { - val root = bamIn.getPath() - val bamRoot = FilenameUtils.removeExtension(root); - val recalData = new File(bamRoot + ".recal_data.csv") - val recalBam = new File(bamRoot + 
".recal.bam") - val recalRecalData = new File(bamRoot + ".recal.recal_data.csv") - if ( ! skipInitialCountCovariates ) - add(new CountCovariates(bamIn, recalData) { useOriginalQualities = true } ) - val tableRecal = new TableRecalibrate(bamIn, recalData, recalBam) { useOriginalQualities = true } - if ( scatter ) { - tableRecal.intervals = List(new File("/humgen/gsa-hpprojects/GATK/data/chromosomes.hg18.interval_list")) - tableRecal.scatterCount = 25 - } - add(tableRecal) - add(new Index(recalBam)) - add(new CountCovariates(recalBam, recalRecalData) { num_threads = 4 }) - add(new AnalyzeCovariates(recalData, new File(recalData.getPath() + ".analyzeCovariates"))) - add(new AnalyzeCovariates(recalRecalData, new File(recalRecalData.getPath() + ".analyzeCovariates"))) - } -} - -def bai(bam: File) = new File(bam + ".bai") - -class Index(bamIn: File) extends SamtoolsIndexFunction { - bamFile = bamIn -} - -class CountCovariates(bamIn: File, recalDataIn: File) extends org.broadinstitute.sting.queue.extensions.gatk.CountCovariates with UNIVERSAL_GATK_ARGS { - this.jarFile = gatkJarFile - this.input_file :+= bamIn - this.recal_file = recalDataIn - this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod") - this.logging_level = "INFO" - this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") - this.memoryLimit = 3 - - override def dotString = "CountCovariates: %s [args %s]".format(bamIn.getName, if (this.num_threads.isDefined) "-nt " + this.num_threads else "") -} - -class TableRecalibrate(bamInArg: File, recalDataIn: File, bamOutArg: File) extends org.broadinstitute.sting.queue.extensions.gatk.TableRecalibration with UNIVERSAL_GATK_ARGS { - this.jarFile = gatkJarFile - this.input_file :+= bamInArg - this.recal_file = recalDataIn - this.out = bamOutArg - this.logging_level = "INFO" - this.memoryLimit = 2 - this.skipUQUpdate = skipUQUpdateArg - - override def dotString = "TableRecalibrate: %s => 
%s".format(bamInArg.getName, bamOutArg.getName, if (this.useOriginalQualities) " -OQ" else "") -} - -class AnalyzeCovariates(recalDataIn: File, outputDir: File) extends org.broadinstitute.sting.queue.extensions.gatk.AnalyzeCovariates { - this.jarFile = new File("/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/AnalyzeCovariates.jar") - this.recal_file = recalDataIn - this.output_dir = outputDir.toString - this.path_to_resources = "/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/R/" - this.ignoreQ = 5 - this.path_to_Rscript = "/broad/software/free/Linux/redhat_5_x86_64/pkgs/r_2.7.2/bin/Rscript" - this.memoryLimit = 2 - - override def dotString = "AnalyzeCovariates: %s".format(recalDataIn.getName) -} -} diff --git a/scala/test/org/broadinstitute/sting/queue/pipeline/playground/HybridSelectionPipelineTest.scala b/scala/test/org/broadinstitute/sting/queue/pipeline/playground/HybridSelectionPipelineTest.scala deleted file mode 100644 index 362e0536e..000000000 --- a/scala/test/org/broadinstitute/sting/queue/pipeline/playground/HybridSelectionPipelineTest.scala +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.queue.pipeline.playground - -import org.testng.annotations.{DataProvider, Test} -import org.broadinstitute.sting.datasources.pipeline.{PipelineSample, Pipeline} -import org.broadinstitute.sting.utils.yaml.YamlUtils -import org.broadinstitute.sting.queue.pipeline._ -import org.broadinstitute.sting.BaseTest - -class HybridSelectionPipelineTest { - def datasets = List(k1gChr20Dataset) - - val k1gChr20Dataset = { - val dataset = newK1gDataset("Barcoded_1000G_WEx_chr20", BaseTest.hg19Chr20Intervals) - - dataset.validations :+= new IntegerValidation("CountVariants", "dbsnp.eval.called.all.all.all", "nCalledLoci", 1392) - dataset.validations :+= new IntegerValidation("CountVariants", "dbsnp.eval.called.all.known.all", "nCalledLoci", 1143) - dataset.validations :+= new IntegerValidation("CountVariants", "dbsnp.eval.called.all.novel.all", "nCalledLoci", 249) - dataset.validations :+= new DoubleValidation("TiTvVariantEvaluator", "dbsnp.eval.called.all.all.all", "tiTvRatio", 3.6250) - dataset.validations :+= new DoubleValidation("TiTvVariantEvaluator", "dbsnp.eval.called.all.known.all", "tiTvRatio", 3.7190) - dataset.validations :+= new DoubleValidation("TiTvVariantEvaluator", "dbsnp.eval.called.all.novel.all", "tiTvRatio", 3.2037) - - dataset - } - - def newK1gDataset(projectName: String, intervals: String) = { - val project = PipelineTest.createHg19Project(projectName, intervals) - var samples = List.empty[PipelineSample] - for (k1gBam <- PipelineTest.k1gBams) - samples :+= PipelineTest.createK1gSample(projectName, k1gBam) - new PipelineDataset(PipelineTest.createPipeline(project, samples)) - } - - @DataProvider(name="datasets")//, parallel=true) - final def 
convertDatasets: Array[Array[AnyRef]] = - datasets.map(dataset => Array(dataset.asInstanceOf[AnyRef])).toArray - - @Test(dataProvider="datasets") - def testHybridSelectionPipeline(dataset: PipelineDataset) { - val projectName = dataset.pipeline.getProject.getName - val testName = "HybridSelectionPipeline-" + projectName - val yamlFile = writeYaml(testName, dataset.pipeline) - - // Run the pipeline with the expected inputs. - val pipelineCommand = - "-retry 1 -S scala/qscript/playground/HybridSelectionPipeline.scala -Y %s" - .format(yamlFile) - - val pipelineSpec = new PipelineTestSpec - pipelineSpec.name = testName - pipelineSpec.args = pipelineCommand - pipelineSpec.jobQueue = dataset.jobQueue - - pipelineSpec.evalSpec = new PipelineTestEvalSpec - pipelineSpec.evalSpec.evalReport = projectName + ".eval" - pipelineSpec.evalSpec.validations = dataset.validations - - PipelineTest.executeTest(pipelineSpec) - } - - private def writeYaml(testName: String, pipeline: Pipeline) = { - val yamlFile = BaseTest.createTempFile(pipeline.getProject.getName + ".", ".yaml") - YamlUtils.dump(pipeline, yamlFile) - yamlFile - } - - class PipelineDataset(var pipeline: Pipeline = null, - var validations: List[PipelineValidation[_]] = Nil, - var jobQueue: String = null) { - override def toString = pipeline.getProject.getName - } -} diff --git a/shell/TraverseTest.sh b/shell/TraverseTest.sh deleted file mode 100755 index 031151ec4..000000000 --- a/shell/TraverseTest.sh +++ /dev/null @@ -1 +0,0 @@ -java -Xmx8192m -jar dist/GenomeAnalysisTK.jar $* diff --git a/shell/TraverseTestProf.sh b/shell/TraverseTestProf.sh deleted file mode 100755 index a0478611c..000000000 --- a/shell/TraverseTestProf.sh +++ /dev/null @@ -1 +0,0 @@ -java -Xmx4096m -agentlib:hprof=cpu=samples,depth=10 -jar dist/GenomeAnalysisTK.jar $* diff --git a/shell/fileSystemSizes.csh b/shell/fileSystemSizes.csh deleted file mode 100755 index 9c943ce46..000000000 --- a/shell/fileSystemSizes.csh +++ /dev/null @@ -1,9 +0,0 @@ 
-#!/bin/tcsh - -setenv DATE `date +"%m_%d_%Y"` -setenv RESULTS "fs_sizes.$DATE.txt" - -rm -f $RESULTS -foreach fs ( /humgen/gsa-scr1/ /humgen/gsa-hphome1/ /humgen/gsa-hpprojects /humgen/gsa-lpprojects ) - du -sh $fs/* >> $RESULTS -end diff --git a/shell/firehose/getFirehoseCurlTsv.sh b/shell/firehose/getFirehoseCurlTsv.sh deleted file mode 100755 index bd0acec26..000000000 --- a/shell/firehose/getFirehoseCurlTsv.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/sh - -# Downloads a set of samples from Firehose using the Firehose API and generate a TSV file with the outputs. -# see: http://iwww.broadinstitute.org/cancer/cga/wiki/index.php/GetAnnotations - -ENTITY_SET_ID=$1 -ENTITY_SET_TYPE=Sample_Set -ENTITY_TYPE=Sample -PASSWORD_FILE=$2 - -if [ "$ENTITY_SET_ID" == "" ]; then - EXIT_USAGE=1 -fi - -if [ "$PASSWORD_FILE" == "" ]; then - echo 'Missing password file with the contents: "-u :"' >&2 - EXIT_USAGE=1 -fi - -if [ $EXIT_USAGE ]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -# Firehose variables - -FIREHOSE_HOST=firehose -FIREHOSE_PORT=8080 -FIREHOSE_DOMAIN=gsa -FIREHOSE_WORKSPACE=trunk - -# TSV file to write - -PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv - -# Annotations to pull down from Firehose - -FIREHOSE_ANNOTATIONS=(reference_file interval_list recalibrated_bam_file squid_project collaborator_id) - -index=0 -count=${#FIREHOSE_ANNOTATIONS[@]} -FIREHOSE_VARIABLES="" - -# Build the tab separated list of firehose arguments - -while [ "$index" -lt "$count" ]; do - FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'&annotationTypes='${FIREHOSE_ANNOTATIONS[$index]} - let "index = $index + 1" -done - -curl --fail -sL -K "$PASSWORD_FILE" -o "$PIPELINE_TSV_FILE" \ - "http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN/ws/entity/getAnnotations/$ENTITY_TYPE?entityNames=$ENTITY_SET_ID&filterSetType=$ENTITY_SET_TYPE&workspaceName=$FIREHOSE_WORKSPACE$FIREHOSE_VARIABLES" || \ - -EXIT_CODE=$? 
- -if [[ $EXIT_CODE -ne 0 ]]; then - echo "curl failed with exit code:" $EXIT_CODE >&2 - echo 'Check the name of your Sample_Set and that your password file '$PASSWORD_FILE' is setup correctly with: "-u :"' >&2 - echo "If that doesn't work make sure you can login to the firehose website: http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN" >&2 - exit $EXIT_CODE -fi diff --git a/shell/firehose/getFirehosePipelineYaml.sh b/shell/firehose/getFirehosePipelineYaml.sh deleted file mode 100755 index 604affcd3..000000000 --- a/shell/firehose/getFirehosePipelineYaml.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh - -# Downloads a set of samples from Firehose and generates a YAML file. - -DIR=`dirname $0` -if [ "$2" == "" ]; then - $DIR/getFirehoseTestTsv.sh $1 && $DIR/pipelineTsvToYaml.sh $1.tsv -else - $DIR/getFirehoseCurlTsv.sh $1 $2 && $DIR/pipelineTsvToYaml.sh $1.tsv -fi diff --git a/shell/firehose/getFirehoseTestTsv.sh b/shell/firehose/getFirehoseTestTsv.sh deleted file mode 100755 index 1fdb4335a..000000000 --- a/shell/firehose/getFirehoseTestTsv.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/sh - -# Downloads a set of samples from Firehose using the obsolete Firehose Test Harness and generate a TSV file with the outputs. 
- -ENTITY_SET_ID=$1 -ENTITY_SET_TYPE=Sample_Set -ENTITY_TYPE=Sample - -if [ "$ENTITY_SET_ID" == "" ]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -# Firehose variables - -FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source -CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis -FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py" -FIREHOSE_HOST=firehose -FIREHOSE_PORT=8080 -FIREHOSE_DOMAIN=gsa -FIREHOSE_WORKSPACE=trunk - -# TSV file to write - -PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv - -# Annotations to pull down from Firehose - -FIREHOSE_ANNOTATIONS=(reference_file interval_list sample_id recalibrated_bam_file squid_project collaborator_id) - -index=0 -count=${#FIREHOSE_ANNOTATIONS[@]} -TSV_HEADER="" -FIREHOSE_VARIABLES="" -TAB=' ' - -# Build the tab separated list of firehose arguments - -while [ "$index" -lt "$count" ]; do - if [ "$FIREHOSE_VARIABLES" != "" ]; then - FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB - TSV_HEADER=$TSV_HEADER$TAB - fi - FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'${FIREHOSE_ANNOTATIONS[$index]}'}' - TSV_HEADER=$TSV_HEADER${FIREHOSE_ANNOTATIONS[$index]} - let "index = $index + 1" -done - -# Retrieve all the required variables via the test harness. -$FIREHOSE_TEST_HARNESS \ - -d $FIREHOSE_DOMAIN -w $FIREHOSE_WORKSPACE \ - -t $ENTITY_TYPE -f $ENTITY_SET_ID -y $ENTITY_SET_TYPE \ - "echo '$FIREHOSE_VARIABLES'" && \ -\ -# Generate tsv header -echo "$TSV_HEADER" > $PIPELINE_TSV_FILE \ -# Generate tsv from firehose output -. firehose-populated-commands.sh >> $PIPELINE_TSV_FILE - -EXIT_CODE=$? 
- -if [[ $EXIT_CODE -ne 0 ]]; then - echo "" >&2 - echo "The Firehose test harness failed with exit code:" $EXIT_CODE >&2 - echo 'Check the name of your Sample_Set or try using the newer getFirehoseCurlTsv.sh' >&2 - exit $EXIT_CODE -fi diff --git a/shell/firehose/pipelineTsvToYaml.sh b/shell/firehose/pipelineTsvToYaml.sh deleted file mode 100755 index 5fce320da..000000000 --- a/shell/firehose/pipelineTsvToYaml.sh +++ /dev/null @@ -1,167 +0,0 @@ -#!/bin/sh - -# Uses awk to generate a YAML file from a TSV. - -# In the awk script and templates: -# - Variables starting with a '$' are columns in the TSV -# - Variables without a '$' are pre-calculated from the first row of data - - -# TSV file to read -PIPELINE_TSV_FILE=$1 - -if [ "$PIPELINE_TSV_FILE" == "" ]; then - echo "Usage: $0 .tsv" >&2 - exit 1 -fi - -ROW_COUNT=(`wc -l $PIPELINE_TSV_FILE`) -if [[ ${ROW_COUNT[0]} -lt 2 ]]; then - echo "Header plus data not found in tsv: $PIPELINE_TSV_FILE" >&2 - exit 1 -fi - -# YAML file to write -PIPELINE_YAML_FILE=${PIPELINE_TSV_FILE%.tsv}.yaml - -# YAML templates - -# Project YAML template, once per file. -PROJECT_YAML_TEMPLATE='"\n\ - project: {\n\ - name: %s,\n\ - referenceFile: %s,\n\ - genotypeDbsnp: %s,\n\ - evalDbsnp: %s,\n\ - refseqTable: %s,\n\ - intervalList: %s\n\ - },", projectName, $referenceFile, genotypeDbsnp, evalDbsnp, refseq, $intervalList' - -# Project YAML template, once per sample. 
-SAMPLE_YAML_TEMPLATE='"\n\ - {\n\ - id: %s,\n\ - bamFiles: { cleaned: %s },\n\ - tags: {\n\ - SQUIDProject: %s,\n\ - CollaboratorID: %s\n\ - }\n\ - }", $sampleId, $bamFile, $squidProject, $collaboratorId' - -TEST_AWK_COUNT=`echo '\n' | awk '{print $0}' | wc -c` -if [ "$TEST_AWK_COUNT" -eq 2 ]; then - # Strip the extra \n from the lines if awk of \n is - # a newline and not the two characters slash-n (on mac) - PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}" - SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}" -fi - -# Generate yaml from tsv -awk ' -{ - if (NR == 1) { - tsvFile = "'$PIPELINE_TSV_FILE'" - - # Set the project name to the TSV file minus the directory and the .tsv - projectName = tsvFile - sub(/\/.*\//, "", projectName) - sub(/\.tsv/, "", projectName) - - # Read the column headers and figure out the index of each column name. - for (i=1; i<=NF; i++) - columnFields[tolower($i)] = i - - referenceFile = columnFields["reference_file"] - intervalList = columnFields["interval_list"] - sampleId = columnFields["sample_id"] - squidProject = columnFields["squid_project"] - collaboratorId = columnFields["collaborator_id"] - - for (key in columnFields) - if (key ~ "bam_file") - bamFile = columnFields[key] - - if (referenceFile == "") { - print "ERROR: Column header reference_file missing from " tsvFile > "/dev/stderr" - exitWithError = 1 - } - - if (intervalList == "") { - print "ERROR: Column header interval_list missing from " tsvFile > "/dev/stderr" - exitWithError = 1 - } - - if (sampleId == "") { - print "ERROR: Column header sample_id missing from " tsvFile > "/dev/stderr" - exitWithError = 1 - } - - if (squidProject == "") { - print "ERROR: Column header squid_project missing from " tsvFile > "/dev/stderr" - exitWithError = 1 - } - - if (collaboratorId == "") { - print "ERROR: Column header collaborator_id missing from " tsvFile > "/dev/stderr" - exitWithError = 1 - } - - if (bamFile == "") { - print "ERROR: Column header *bam_file* missing from 
" tsvFile > "/dev/stderr" - exitWithError = 1 - } - - if (exitWithError) { - exit 1 - } - - refseqDir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/" - dbsnpDir = "/humgen/gsa-hpprojects/GATK/data/" - - # add hg18 specific files to awk associative arrays - genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod" - evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod" - refseqs["Homo_sapiens_assembly18.fasta"] = refseqDir "refGene-big-table-hg18.txt" - - # add hg19 specific files to awk associative arrays - genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_132_b37.vcf" - evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_129_b37.vcf" - refseqs["Homo_sapiens_assembly19.fasta"] = refseqDir "refGene-big-table-hg19.txt" - - printf "{" - } else { - missingValue = 0 - if ($referenceFile == "") missingValue = 1 - if ($intervalList == "") missingValue = 1 - if ($sampleId == "") missingValue = 1 - if ($squidProject == "") missingValue = 1 - if ($collaboratorId == "") missingValue = 1 - if ($bamFile == "") missingValue = 1 - - if (missingValue) { - print "WARNING: Skipping row which does not have all values: " $0 > "/dev/stderr" - } else { - if (NR == 2) { - # Based on the reference of the first sample, specify the dbsnps and refseq tables. 
- - referencePartCount = split($referenceFile, referenceParts, "/") - referenceName = referenceParts[referencePartCount] - - genotypeDbsnp = genotypeDbsnps[referenceName] - evalDbsnp = evalDbsnps[referenceName] - refseq = refseqs[referenceName] - - printf '"$PROJECT_YAML_TEMPLATE"' - printf "\n samples: [" - } else { - printf "," - } - printf '"$SAMPLE_YAML_TEMPLATE"' - } - } -} -END { - if (NR > 0) - printf "\n ]" - print "\n}" -}' "$PIPELINE_TSV_FILE" > "$PIPELINE_YAML_FILE" diff --git a/shell/pipelineJobs.csh b/shell/pipelineJobs.csh deleted file mode 100755 index 85b870526..000000000 --- a/shell/pipelineJobs.csh +++ /dev/null @@ -1,8 +0,0 @@ -echo "Running" -bjobs -u gsaadm -r | grep gsaadm | awk '{print $4}' | sort | uniq -c - -echo "Pending" -bjobs -u gsaadm -p | grep gsaadm | awk '{print $4}' | sort | uniq -c - -echo "Suspended" -bjobs -u gsaadm -s | grep gsaadm | awk '{print $4}' | sort | uniq -c diff --git a/shell/queue.sh b/shell/queue.sh deleted file mode 100755 index 3b2826d92..000000000 --- a/shell/queue.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/sh - -SAMPLE_SET=$1 -STING_HOME=/humgen/gsa-pipeline/.repository -JOB_QUEUE=week -SHORT_QUEUE=hour -TMP_DIR=$PWD/tmp - -mkdir $PWD/tmp -source /broad/software/scripts/useuse -use LSF -use R-2.10 -use Oracle-full-client -use .cx-oracle-5.0.2-python-2.6.5-oracle-full-client-11.1 - -java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/FullCallingPipeline.q -Y "$SAMPLE_SET".yaml --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2 diff --git a/shell/queueStatus.csh b/shell/queueStatus.csh deleted file mode 100755 index b0c0889a3..000000000 --- a/shell/queueStatus.csh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/tcsh - -# what's the status of all of the gsa 
hosts -echo "GSA host status" -bhosts gsahosts - -echo "\nGSA queue usage" -bjobs -u all -q gsa | awk '$2 !~ "USER" {print $2}' | sort | uniq -c - -echo "\nGeneral computing resources" -bqueues gsa week short broad - -echo "\nFH jobs" -bjobs -u gsa-adm - -echo "\nFile system status" -ls /humgen/gsa-scr1 /humgen/1kg /humgen/gsa-hpprojects /humgen/gsa-hphome1 /humgen/gsa-pipeline /humgen/gsa-firehose2 /humgen/gsa-lpprojects - -df -h /humgen/* /broad/shptmp diff --git a/shell/runGATKReport.csh b/shell/runGATKReport.csh deleted file mode 100755 index d12a32afe..000000000 --- a/shell/runGATKReport.csh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/tcsh - -source /broad/tools/scripts/useuse - -reuse Python-2.5 -use R-2.11 - -setenv DIR /humgen/gsa-hpprojects/GATK/reports -setenv ARCHIVE_DIR $DIR/archive -setenv SUMMARY_DIR $DIR/summaries -setenv DATE `date +"%m_%d_%Y"` -setenv ARCHIVE $ARCHIVE_DIR/$DATE -setenv SUMMARY $SUMMARY_DIR/$DATE -setenv GATK ~/dev/GenomeAnalysisTK/trunk -setenv GATK_RELEASE_VERSION `ls -l /humgen/gsa-hpprojects/GATK/bin/current | sed 's/.*GenomeAnalysisTK-//'` -setenv REPORT_TXT $DIR/report.txt - -rm -f $REPORT_TXT - -cd $DIR - -echo "\n####################\nArchiving recently submitted jobs" >> $REPORT_TXT -python $GATK/python/analyzeRunReports.py archive $DIR/submitted -o $ARCHIVE.gz -D >> $REPORT_TXT - -echo "\n####################\nReleased version ($GATK_RELEASE_VERSION), all runs" >> $REPORT_TXT -python $GATK/python/analyzeRunReports.py summary $ARCHIVE_DIR/*.gz --rev $GATK_RELEASE_VERSION >> $REPORT_TXT -python $GATK/python/analyzeRunReports.py exceptions $ARCHIVE_DIR/*.gz -E sting --rev $GATK_RELEASE_VERSION >> $REPORT_TXT - -echo "\n####################\nLast day, all versions" >> $REPORT_TXT -python $GATK/python/analyzeRunReports.py summary $ARCHIVE.gz --max_days 1 --no-dev >> $REPORT_TXT -python $GATK/python/analyzeRunReports.py exceptions $ARCHIVE.gz --max_days 1 -E sting --no-dev >> $REPORT_TXT - -#echo "Archive directory contents" -#du -sh 
$ARCHIVE_DIR - -if (1 == 0) then -foreach maxDays ( 30 360 ) - echo "Creating table" - setenv table $ARCHIVE.${maxDays}_days.table - python $GATK/python/analyzeRunReports.py table $ARCHIVE_DIR/*.gz -o $table --max_days $maxDays - - echo "Creating summary" - Rscript $GATK/R/GATKRunReport.R $table $SUMMARY.${maxDays}_days.pdf "of previous $maxDays days" - - echo "Creating exception report" - python $GATK/python/analyzeRunReports.py exceptions $ARCHIVE_DIR/*.gz -o $SUMMARY.${maxDays}_days.sting.exceptions.txt --max_days $maxDays -E sting --no-dev - python $GATK/python/analyzeRunReports.py exceptions $ARCHIVE_DIR/*.gz -o $SUMMARY.${maxDays}_days.user.exceptions.txt --max_days $maxDays -E user --no-dev - - rm $table -end -endif - -#echo "GATK daily run report" | mutt -a $SUMMARY.30_days.pdf -a $SUMMARY.360_days.pdf -a $SUMMARY.7_days.pdf -s "GATK Run report PDFs for $DATE" gsamembers -#cat $REPORT_TXT | mutt -a $REPORT_TXT -a $SUMMARY.30_days.pdf -a $SUMMARY.360_days.pdf -s "GATK run report for $DATE" gsamembers -cat $REPORT_TXT | mutt -a $REPORT_TXT -s "GATK run report for $DATE" gsamembers - diff --git a/shell/runbeagle.sh b/shell/runbeagle.sh deleted file mode 100755 index 8dce11421..000000000 --- a/shell/runbeagle.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/bash - - -chr=$1 - -# set basic input and output paths. 
User should modify the following to point to: -# a) Input vcf, -# b) Output base path for all files -# c) Location of beagle.jar -# d) Path to reference file -beagle_prefix="CEUTSI.chr${chr}.bgl" -beagledir="/broad/shptmp/username/beagle/CEUTSI_Pilot1/chr${chr}/" -beaglejar="../../beagle/beagle.jar" - -input_vcf="/broad/shptmp/username/beagle/CEUTSI_Pilot1/CEUTSI.recal.filtered.vcf" -tmpdir="/broad/shptmp/username/tmp" -bgloutprefix="recal.filtered" -reffile="/broad/1KG/reference/human_g1k_v37.fasta" - - -# Set to one to determine which sections of beagle pipeline to run -runinput=1 # Run VCF to Beagle input converter -runbgl=1 # Run Beagle -runoutput=1 # run Beagle output-to-VCF converter -runvarianteval=1 # Run Variant Evaluator to measure Beagle performance. - -# Reference files for variant evaluator -dohapmap=1 -do1kgchip=1 - -# Path to HapMap/1KG Chip truth sets for variant evaluator -if [ $dohapmap == 1 ] -then - cmpfileh="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr.hg19_fwd.vcf" -fi - -if [ $do1kgchip == 1 ] -then - cmpfile1="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_chip_jan2010/1000Genome.chip.hg19.filtered.vcf" -fi - -outputbglvcf=$beagledir$bgloutprefix.$beagle_prefix.output.vcf - - -# check if Beagle directory exists. If not, create it. -if [ ! -d $beagledir ] -then - echo "Creating Beagle directory $beagledir" - mkdir $beagledir -fi - -if [ $runinput == 1 ] -then - echo "Running GATK to create Beagle input" - - java -Xmx4000m -jar ../dist/GenomeAnalysisTK.jar -L $chr -l INFO \ - -R $reffile -T ProduceBeagleInput \ - -B vcf,VCF,$input_vcf \ - -beagle $beagledir$beagle_prefix -fi - -# now, run beagle -if [ $runbgl == 1 ] -then - echo "Running Beagle..." 
- java -Xmx8000m -Djava.io.tmpdir=$tmpdir -jar $beaglejar like=$beagledir$beagle_prefix out=$bgloutprefix - - # move output files to beagle directory - cp ./$bgloutprefix.* $beagledir - # unzip gzip'd files, force overwrite if existing - gunzip -f $beagledir$bgloutprefix.$beagle_prefix.gprobs.gz - gunzip -f $beagledir$bgloutprefix.$beagle_prefix.phased.gz - #rename also Beagle likelihood file to mantain consistency - cp $beagledir$beagle_prefix $beagledir$bgloutprefix.$beagle_prefix.like - cp $beagledir$beagle_prefix.log $beagledir$bgloutprefix.$beagle_prefix.log -fi - -# run GATK to parse Beagle files and to produce output vcf -if [ $runoutput == 1 ] -then - java -Xmx4000m -Djava.io.tmpdir=$tmpdir -jar ../dist/GenomeAnalysisTK.jar \ - -R $reffile -T BeagleOutputToVCF -l INFO -L $chr \ - -B inputvcf,VCF,$input_vcf \ - -B beagleR2,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.r2 \ - -B beagleProbs,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.gprobs \ - -B beaglePhased,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.phased \ - -output $beagledir$bgloutprefix.$beagle_prefix.output.vcf -fi - -if [ $runvarianteval == 1 ] -then - # finally, run VariantEval to produce useful comparisons between pre-and post-Beagle vcf's. 
- if [ $dohapmap == 1 ] - then - java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \ - -R $reffile -l INFO -L $chr \ - -B eval_prebeagle,VCF,$input_vcf \ - -B eval_beagle,VCF,$outputbglvcf \ - -B comp_hapmap,VCF,$cmpfileh \ - -reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_hapmap_grep.txt - fi - - if [ $do1kgchip == 1 ] - then - java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \ - -R $reffile -l INFO -L $chr \ - -B eval_prebeagle,VCF,$input_vcf \ - -B eval_beagle,VCF,$outputbglvcf \ - -B comp_1kgchip,VCF,$cmpfile1 \ - -reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_1kgchip_grep.txt - fi -fi - - diff --git a/shell/syncWithDevOnGSA2.csh b/shell/syncWithDevOnGSA2.csh deleted file mode 100755 index 4f2445ee2..000000000 --- a/shell/syncWithDevOnGSA2.csh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/tcsh - -setenv HERE "java tribble scala analysis" -setenv THERE \~/dev/GenomeAnalysisTKFromLaptop/trunk - -rsync -e ssh -aCvz $HERE depristo@gsa1:$THERE