From efd99c3c117e2a423c93e9390076d1b9437d406b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 11:32:06 -0400 Subject: [PATCH 03/31] new home for the core qscripts --- .../sting/queue/qscripts}/DataProcessingPipeline.scala | 2 +- .../sting/queue/qscripts}/GATKResourcesBundle.scala | 2 +- .../queue/qscripts}/MethodsDevelopmentCallingPipeline.scala | 2 ++ .../sting/queue/qscripts}/RecalibrateBaseQualities.scala | 2 +- .../sting/queue/qscripts}/StandardVariantEvaluation.scala | 2 +- 5 files changed, 6 insertions(+), 4 deletions(-) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/DataProcessingPipeline.scala (99%) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/GATKResourcesBundle.scala (99%) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/MethodsDevelopmentCallingPipeline.scala (99%) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/RecalibrateBaseQualities.scala (98%) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/StandardVariantEvaluation.scala (99%) diff --git a/public/scala/qscript/core/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala similarity index 99% rename from public/scala/qscript/core/DataProcessingPipeline.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index cad19bd23..fdde5e8a2 100755 --- a/public/scala/qscript/core/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript diff --git a/public/scala/qscript/core/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala similarity index 99% rename from public/scala/qscript/core/GATKResourcesBundle.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 24ae045d3..150d78019 100755 --- a/public/scala/qscript/core/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ diff --git a/public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala similarity index 99% rename from public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index 632c03499..d71c6ae5d 100755 --- a/public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -1,3 +1,5 @@ +package org.broadinstitute.sting.queue.qscripts + import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript diff --git a/public/scala/qscript/core/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala similarity index 98% rename from public/scala/qscript/core/RecalibrateBaseQualities.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index ba13d267a..678b356ee 100755 --- a/public/scala/qscript/core/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ diff --git a/public/scala/qscript/core/StandardVariantEvaluation.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala similarity index 99% rename from public/scala/qscript/core/StandardVariantEvaluation.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala index d6af7019e..d333e1dc0 100755 --- a/public/scala/qscript/core/StandardVariantEvaluation.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk.RodBind From 352c38fc0b48d8910ad3346eff0313b2f0890b06 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Jun 2011 11:55:56 -0400 Subject: [PATCH 05/31] Updated to reflect dbsnp conversion fix --- .../walkers/indels/RealignerTargetCreatorIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java index acfcbab7b..4b225aaea 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -19,7 +19,7 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( "-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("fd2d9dbf718f7a6d82ae1787ac1fe61e")); + Arrays.asList("f23ba17ee0f9573dd307708175d90cd2")); executeTest("test dbsnp", spec2); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( From 9e234cf5d6b8cf3439fc0703248c637c3119f1e2 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Jun 2011 13:17:14 -0400 Subject: [PATCH 06/31] This is a temporary commit for Picard. It will absolutely break integration tests, but I'm going to revert it in 1 minute. Because we don't want them in unstable, I need to push this into stable. --- .../sting/gatk/walkers/indels/IndelRealigner.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 3cfa58c02..9816e17f7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -99,7 +99,7 @@ public class IndelRealigner extends ReadWalker { protected SAMFileWriter writerToUse = null; @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false) - public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; + public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_SW; // ADVANCED OPTIONS FOLLOW @@ -167,7 +167,8 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false) protected boolean DEPRECATED_REALIGN_MATES = false; - @Deprecated + //@Deprecated + @Hidden @Argument(fullName="useOnlyKnownIndels", shortName="knownsOnly", doc="This argument is no longer used. See --consensusDeterminationModel instead.", required=false) protected boolean DEPRECATED_KNOWNS_ONLY = false; @@ -261,6 +262,9 @@ public class IndelRealigner extends ReadWalker { public void initialize() { + if ( DEPRECATED_KNOWNS_ONLY ) + consensusModel = ConsensusDeterminationModel.KNOWNS_ONLY; + if ( N_WAY_OUT == null && writer == null ) { throw new UserException.CommandLineException("Either -o or -nWayOut must be specified"); } From 804d5f22d522516ea787e3873a7bfbaf3c2035a0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Jun 2011 13:18:30 -0400 Subject: [PATCH 07/31] Reverting previous change, as promised. --- .../sting/gatk/walkers/indels/IndelRealigner.java | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 9816e17f7..a53665d64 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -99,7 +99,7 @@ public class IndelRealigner extends ReadWalker { protected SAMFileWriter writerToUse = null; @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false) - public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_SW; + public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; // ADVANCED OPTIONS FOLLOW @@ -167,8 +167,7 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false) protected boolean DEPRECATED_REALIGN_MATES = false; - //@Deprecated - @Hidden + @Deprecated @Argument(fullName="useOnlyKnownIndels", shortName="knownsOnly", doc="This argument is no longer used. See --consensusDeterminationModel instead.", required=false) protected boolean DEPRECATED_KNOWNS_ONLY = false; @@ -262,9 +261,6 @@ public class IndelRealigner extends ReadWalker { public void initialize() { - if ( DEPRECATED_KNOWNS_ONLY ) - consensusModel = ConsensusDeterminationModel.KNOWNS_ONLY; - if ( N_WAY_OUT == null && writer == null ) { throw new UserException.CommandLineException("Either -o or -nWayOut must be specified"); } @@ -1015,7 +1011,7 @@ public class IndelRealigner extends ReadWalker { sb.append((char)b); cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); } else { - throw new ReviewedStingException("Creating an alternate consensus from a complex indel is not allowed"); + throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows"); } if ( reference.length - refIdx > 0 ) From f4463d38ca28209ad59b3c5cb7579c3f104fab5c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 14:29:21 -0400 Subject: [PATCH 08/31] BWA requires pair ended reads to be sorted by read names when operating over BAM files, but Picard sorts by coordinate, so in case we use BWA in pair ended reads, the pipeline now resorts the BAM in read name order, realigns it then sorts it in coordinate order. --- .../queue/qscripts/DataProcessingPipeline.scala | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index fdde5e8a2..623181de6 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -4,12 +4,12 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.function.ListWriterFunction -import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord} - import scala.io.Source._ import collection.JavaConversions._ import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel import org.broadinstitute.sting.queue.extensions.picard._ +import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord} +import net.sf.samtools.SAMFileHeader.SortOrder class DataProcessingPipeline extends QScript { @@ -180,6 +180,7 @@ class DataProcessingPipeline extends QScript { var realignedBams: List[File] = List() var index = 1 for (bam <- bams) { + val readSortedBam = swapExt(bam, ".bam", "." + index + ".sorted.bam" ) val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai") val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai") val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam") @@ -190,11 +191,12 @@ class DataProcessingPipeline extends QScript { bwa_sam_se(bam, saiFile1, realignedSamFile)) } else { - add(bwa_aln_pe(bam, saiFile1, 1), - bwa_aln_pe(bam, saiFile2, 2), - bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile)) + add(sortSam(bam, readSortedBam, SortOrder.queryname), + bwa_aln_pe(readSortedBam, saiFile1, 1), + bwa_aln_pe(readSortedBam, saiFile2, 2), + bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile)) } - add(sortSam(realignedSamFile, realignedBamFile)) + add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate)) addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam)) realignedBams :+= rgRealignedBamFile index = index + 1 @@ -385,9 +387,10 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outBam + ".joinBams" } - case class sortSam (inSam: File, outBam: File) extends SortSam { + case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam { this.input = List(inSam) this.output = outBam + this.sortOrder = sortOrderP this.memoryLimit = 4 this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".sortSam" From 197b7141c198b40b23c058414379fdba2fb47450 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 14:41:57 -0400 Subject: [PATCH 09/31] Added an optional argument -bt for BWA to run multithreaded. --- .../sting/queue/qscripts/DataProcessingPipeline.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 623181de6..f9369ee3f 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -69,6 +69,8 @@ class DataProcessingPipeline extends QScript { @Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false) var useBWApe: Boolean = false + @Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false) + var bwaThreads: Int = 1 /**************************************************************************** @@ -433,7 +435,7 @@ class DataProcessingPipeline extends QScript { case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with BWACommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file") var sai = outSai - def commandLine = bwaPath + " aln -q 5 " + reference + " -b " + bam + " > " + sai + def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai this.analysisName = queueLogDir + outSai + ".bwa_aln_se" this.jobName = queueLogDir + outSai + ".bwa_aln_se" } @@ -441,7 +443,7 @@ class DataProcessingPipeline extends QScript { case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with BWACommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file for 1st mating pair") var sai = outSai1 - def commandLine = bwaPath + " aln -q 5 " + reference + " -b" + index + " " + bam + " > " + sai + def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai this.analysisName = queueLogDir + outSai1 + ".bwa_aln_pe1" this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1" } From f4ae6edb9240ee80debba64693c678d782b23ca1 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 30 Jun 2011 14:55:25 -0400 Subject: [PATCH 10/31] Moving some of the released R scripts into public from private --- .../R/plot_Annotations_BinnedTruthMetrics.R | 190 ++++++++++++++++++ public/R/plot_Tranches.R | 87 ++++++++ public/R/plot_residualError_OtherCovariate.R | 108 ++++++++++ ...plot_residualError_QualityScoreCovariate.R | 70 +++++++ public/R/titvFPEst.R | 138 +++++++++++++ 5 files changed, 593 insertions(+) create mode 100644 public/R/plot_Annotations_BinnedTruthMetrics.R create mode 100755 public/R/plot_Tranches.R create mode 100644 public/R/plot_residualError_OtherCovariate.R create mode 100644 public/R/plot_residualError_QualityScoreCovariate.R create mode 100755 public/R/titvFPEst.R diff --git a/public/R/plot_Annotations_BinnedTruthMetrics.R b/public/R/plot_Annotations_BinnedTruthMetrics.R new file mode 100644 index 000000000..9f9ee290c --- /dev/null +++ b/public/R/plot_Annotations_BinnedTruthMetrics.R @@ -0,0 +1,190 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +annotationName = args[2] +minBinCutoff = as.numeric(args[3]) +medianNumVariants = args[4] + +c <- read.table(input, header=T) + +all = c[c$numVariants>minBinCutoff & c$category=="all",] +novel = c[c$numVariants>minBinCutoff & c$category=="novel",] +dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] +truth = c[c$numVariants>minBinCutoff & c$category=="truth",] + +# +# Calculate min, max, medians +# + +d = c[c$numVariants>minBinCutoff,] +ymin = min(d$titv) +ymax = max(d$titv) +xmin = min(d$value) +xmax = max(d$value) +m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) +ma = all[all$value > m,] +mb = all[all$value < m,] +m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) +m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) +if(medianNumVariants == "true") { +vc = cumsum( all$numVariants/sum(all$numVariants) ) +m10 = all$value[ max(which(vc<=0.10)) ] +m25 = all$value[ max(which(vc<=0.25)) ] +m = all$value[ max(which(vc<=0.5)) ] +m75 = all$value[ min(which(vc>=0.75)) ] +m90 = all$value[ min(which(vc>=0.90)) ] +} + +# +# Plot TiTv ratio as a function of the annotation +# + +outfile = paste(input, ".TiTv.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +points(novel$value,novel$titv,col="green",pch=20) +points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { +legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} +dev.off() + +# +# Plot TiTv ratio as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".TiTv_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +points(novel$value,novel$titv,col="green",pch=20) +points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { +legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation +# + +ymin = min(all$dbsnp) +ymax = max(all$dbsnp) +outfile = paste(input, ".truthRate.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +t = all[all$truePositive>0,] +yLabel = "DBsnp/True Positive Rate" +ymin = min(min(all$dbsnp),min(t$truePositive)) +ymax = max(max(all$dbsnp),max(t$truePositive)) +} +plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(t$value,t$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".truthRate_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +yLabel = "DBsnp/Truth Rate" +} +plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(t$value,t$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot histogram of the annotation's value +# + +outfile = paste(input, ".Histogram.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +dev.off() + +# +# Plot histogram of the annotation's value, log scale on x-axis +# + +outfile = paste(input, ".Histogram_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +dev.off() diff --git a/public/R/plot_Tranches.R b/public/R/plot_Tranches.R new file mode 100755 index 000000000..a79ddd3ab --- /dev/null +++ b/public/R/plot_Tranches.R @@ -0,0 +1,87 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +tranchesFile = args[1] +targetTITV = as.numeric(args[2]) +targetSensitivity = as.numeric(args[3]) +suppressLegend = ! is.na(args[4]) + +# ----------------------------------------------------------------------------------------------- +# Useful general routines +# ----------------------------------------------------------------------------------------------- + +MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate + +titvFPEst <- function(titvExpected, titvObserved) { + max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE) +} + +titvFPEstV <- function(titvExpected, titvs) { + sapply(titvs, function(x) titvFPEst(titvExpected, x)) +} + +nTPFP <- function(nVariants, FDR) { + return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100))) +} + +leftShift <- function(x, leftValue = 0) { + r = rep(leftValue, length(x)) + for ( i in 1:(length(x)-1) ) { + #print(list(i=i)) + r[i] = x[i+1] + } + r +} + +# ----------------------------------------------------------------------------------------------- +# Tranches plot +# ----------------------------------------------------------------------------------------------- +data2 = read.table(tranchesFile,sep=",",head=T) +data2 = data2[order(data2$novelTiTv, decreasing=F),] +#data2 = data2[order(data2$FDRtranche, decreasing=T),] +cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange") +density=c(20, -1, -1, 20) +outfile = paste(tranchesFile, ".pdf", sep="") +pdf(outfile, height=5, width=8) +par(mar = c(5, 5, 4, 2) + 0.1) +novelTiTv = c(data2$novelTITV,data2$novelTiTv) +alpha = 1 - titvFPEstV(targetTITV, novelTiTv) +#print(alpha) + +numGood = round(alpha * data2$numNovel); + +#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100)) +numBad = data2$numNovel - numGood; + +numPrevGood = leftShift(numGood, 0) +numNewGood = numGood - numPrevGood +numPrevBad = leftShift(numBad, 0) +numNewBad = numBad - numPrevBad + +d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE) +#print(d) +barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000)) +#abline(v= d[2,dim(d)[2]], lty=2) +#abline(v= d[1,3], lty=2) +if ( ! suppressLegend ) + legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25) + +mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) +mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) +axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0) +axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0) + +# plot sensitivity vs. specificity +sensitivity = data2$truthSensitivity +if ( ! is.null(sensitivity) ) { + #specificity = titvFPEstV(targetTITV, novelTiTv) + specificity = novelTiTv + plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)") + abline(h=targetTITV, lty=2) + abline(v=targetSensitivity, lty=2) + #text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2) +} + +dev.off() diff --git a/public/R/plot_residualError_OtherCovariate.R b/public/R/plot_residualError_OtherCovariate.R new file mode 100644 index 000000000..a1385ff3f --- /dev/null +++ b/public/R/plot_residualError_OtherCovariate.R @@ -0,0 +1,108 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +covariateName = args[2] + +outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +c <- read.table(input, header=T) +c <- c[sort.list(c[,1]),] + +# +# Plot residual error as a function of the covariate +# + +d.good <- c[c$nBases >= 1000,] +d.1000 <- c[c$nBases < 1000,] +rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh +rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) ) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) +if( length(d.good$nBases) == length(c$nBases) ) { + theTitle = paste("RMSE =", round(rmseAll,digits=3)) +} +# Don't let residual error go off the edge of the plot +d.good$residualError = d.good$Qempirical-d.good$Qreported +d.good$residualError[which(d.good$residualError > 10)] = 10 +d.good$residualError[which(d.good$residualError < -10)] = -10 +d.1000$residualError = d.1000$Qempirical-d.1000$Qreported +d.1000$residualError[which(d.1000$residualError > 10)] = 10 +d.1000$residualError[which(d.1000$residualError < -10)] = -10 +c$residualError = c$Qempirical-c$Qreported +c$residualError[which(c$residualError > 10)] = 10 +c$residualError[which(c$residualError < -10)] = -10 +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10)) + points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue") +} +dev.off() + + +# +# Plot mean quality versus the covariate +# + +outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +theTitle = paste("Quality By", covariateName); +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40)) + points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue") +} +dev.off() + +# +# Plot histogram of the covariate +# + +e = d.good +f = d.1000 +outfile = paste(input, ".", covariateName,"_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0) + +lwdSize=2 +if( length(c$Covariate) <= 20 ) { + lwdSize=7 +} else if( length(c$Covariate) <= 70 ) { + lwdSize=4 +} + +if( is.numeric(c$Covariate) ) { + if( length(hst$e.Covariate) == 0 ) { + plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) + } else { + plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) + points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue") + } + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0) + plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n") + if( length(hst$c.Covariate) > 9 ) { + axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)]) + } else { + axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate) + } + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} +dev.off() diff --git a/public/R/plot_residualError_QualityScoreCovariate.R b/public/R/plot_residualError_QualityScoreCovariate.R new file mode 100644 index 000000000..81bc9460d --- /dev/null +++ b/public/R/plot_residualError_QualityScoreCovariate.R @@ -0,0 +1,70 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) + +input = args[1] +Qcutoff = as.numeric(args[2]) +maxQ = as.numeric(args[3]) +maxHist = as.numeric(args[4]) + +t=read.table(input, header=T) + +# +# Plot of reported quality versus empirical quality +# + +outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") +pdf(outfile, height=7, width=7) +d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,] +d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,] +d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,] +f <- t[t$Qreported < Qcutoff,] +e <- rbind(d.good, d.1000, d.10000) +rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh +rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) ) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) +if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) { + theTitle = paste("RMSE =", round(rmseAll,digits=3)); +} +plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score") +points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) +points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16) +points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16) +abline(0,1, lty=2) +dev.off() + +# +# Plot Q empirical histogram +# + +outfile = paste(input, ".quality_emp_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0) +percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases)) +entropy = -sum(log2(percentBases)*percentBases) +yMax = max(hst$e.nBases) +if(maxHist != 0) { +yMax = maxHist +} +plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n") +points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() + +# +# Plot Q reported histogram +# + +outfile = paste(input, ".quality_rep_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0) +yMax = max(hst$e.nBases) +if(maxHist != 0) { +yMax = maxHist +} +plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n") +points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() diff --git a/public/R/titvFPEst.R b/public/R/titvFPEst.R new file mode 100755 index 000000000..7af5e8bbb --- /dev/null +++ b/public/R/titvFPEst.R @@ -0,0 +1,138 @@ +titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) } + +titvFPEstV <- function(titvExpected, titvs) { + sapply(titvs, function(x) titvFPEst(titvExpected, x)) +} + +calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) { + TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel + 2 * TP / 3 / callable +} + +marginalTiTv <- function( nx, titvx, ny, titvy ) { + tvx = nx / (titvx + 1) + tix = nx - tvx + tvy = ny / (titvy + 1) + tiy = ny - tvy + tiz = tix - tiy + tvz = tvx - tvy + return(tiz / tvz) +} +marginaldbSNPRate <- function( nx, dbx, ny, dby ) { + knownx = nx * dbx / 100 + novelx = nx - knownx + knowny = ny * dby / 100 + novely = ny - knowny + knownz = knownx - knowny + novelz = novelx - novely + return(knownz / ( knownz + novelz ) * 100) +} + +numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) { + nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals)) + return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls)) +} + +normalize <- function(x) { + x / sum(x) +} + +normcumsum <- function(x) { + cumsum(normalize(x)) +} + +cumhist <- function(d, ...) { + plot(d[order(d)], type="b", col="orange", lwd=2, ...) +} + +revcumsum <- function(x) { + return(rev(cumsum(rev(x)))) +} + +phred <- function(x) { + log10(max(x,10^(-9.9)))*-10 +} + +pOfB <- function(b, B, Q) { + #print(paste(b, B, Q)) + p = 1 - 10^(-Q/10) + if ( b == B ) + return(p) + else + return(1 - p) +} + +pOfG <- function(bs, qs, G) { + a1 = G[1] + a2 = G[2] + + log10p = 0 + for ( i in 1:length(bs) ) { + b = bs[i] + q = qs[i] + p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2 + log10p = log10p + log10(p1) + } + + return(log10p) +} + +pOfGs <- function(nAs, nBs, Q) { + bs = c(rep("a", nAs), rep("t", nBs)) + qs = rep(Q, nAs + nBs) + G1 = c("a", "a") + G2 = c("a", "t") + G3 = c("t", "t") + + log10p1 = pOfG(bs, qs, G1) + log10p2 = pOfG(bs, qs, G2) + log10p3 = pOfG(bs, qs, G3) + Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3)))) + + return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)) +} + +QsampleExpected <- function(depth, Q) { + weightedAvg = 0 + for ( d in 1:(depth*3) ) { + Qsample = 0 + pOfD = dpois(d, depth) + for ( nBs in 0:d ) { + pOfnB = dbinom(nBs, d, 0.5) + nAs = d - nBs + Qsample = pOfGs(nAs, nBs, Q)$Qsample + #Qsample = 1 + weightedAvg = weightedAvg + Qsample * pOfD * pOfnB + print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg))) + } + } + + return(weightedAvg) +} + +plotQsamples <- function(depths, Qs, Qmax) { + cols = rainbow(length(Qs)) + plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling") + + for ( i in 1:length(Qs) ) { + Q = Qs[i] + y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q))) + points(depths, y, col=cols[i], type="b") + } + + legend("topleft", paste("Q", Qs), fill=cols) +} + +pCallHetGivenDepth <- function(depth, nallelesToCall) { + depths = 0:(2*depth) + pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5))) + dpois(depths,depth)*(1-pNoAllelesToCall) +} + +pCallHets <- function(depth, nallelesToCall) { + sum(pCallHetGivenDepth(depth,nallelesToCall)) +} + +pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) { + 1-(1-pCallHets(depth,nallelesToCall))^nsamples +} From 2cb1376ed0acb2fea84d5d38a6c524e603ed269b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 14:55:39 -0400 Subject: [PATCH 11/31] VCFStreaming was failing integration tests because now select variants outputs the samples in alphabetical order, instead of random as before. Fixed the MD5. --- .../gatk/walkers/variantutils/VCFStreamingIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 15733e104..4ed9718fd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " --NO_HEADER" + " -o %s", 1, - Arrays.asList("2cae3d16f9ed00b07d87e9c49272d877") + Arrays.asList("995c07ccd593fe1c35d0d28155112a55") ); executeTest("testSimpleVCFStreaming", spec); From defa3cfe852c1a1e6a0a01a4cffb59353e2dbeed Mon Sep 17 00:00:00 2001 From: "Mark A. DePristo" Date: Thu, 30 Jun 2011 14:59:58 -0400 Subject: [PATCH 12/31] Moved around private walkers into appropriate directories in private gatk.walkers. Moved a few public walkers into private qc package, and some private qc walkers into the public directory. Removed several obviously broken and/or unused walkers. --- .../walkers/indels/RealignedReadCounter.java | 143 +++++++++ .../sting/gatk/walkers/qc/CountIntervals.java | 62 ++++ .../gatk/walkers/qc/ProfileRodSystem.java | 157 --------- .../walkers/qc/RodSystemValidationWalker.java | 153 +++++++++ .../gatk/walkers/qc/ValidateBAQWalker.java | 298 ------------------ .../sting/utils/baq/BAQUnitTest.java | 21 +- 6 files changed, 373 insertions(+), 461 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java new file mode 100755 index 000000000..fc196e712 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.commandline.Argument; + +import java.io.File; +import java.util.*; + +@By(DataSource.READS) +// walker to count realigned reads +public class RealignedReadCounter extends ReadWalker { + + public static final String ORIGINAL_CIGAR_TAG = "OC"; + public static final String ORIGINAL_POSITION_TAG = "OP"; + + @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) + protected String intervalsFile = null; + + // the intervals input by the user + private Iterator intervals = null; + + // the current interval in the list + private GenomeLoc currentInterval = null; + + private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0; + private boolean intervalWasUpdated = false; + + public void initialize() { + // prepare to read intervals one-by-one, as needed (assuming they are sorted). + intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); + currentInterval = intervals.hasNext() ? intervals.next() : null; + } + + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + if ( currentInterval == null ) { + return 0; + } + + GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read); + // hack to get around unmapped reads having screwy locations + if ( readLoc.getStop() == 0 ) + readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); + + if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) ) + return 0; + + if ( readLoc.overlapsP(currentInterval) ) { + if ( doNotTryToClean(read) ) + return 0; + + if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) { + String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG); + // deal with an old bug + if ( read.getCigar().toString().equals(newCigar) ) { + //System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar); + return 0; + } + + if ( !intervalWasUpdated ) { + intervalWasUpdated = true; + updatedIntervals++; + affectedBases += 20 + getIndelSize(read); + } + updatedReads++; + + } + } else { + do { + intervalWasUpdated = false; + currentInterval = intervals.hasNext() ? intervals.next() : null; + } while ( currentInterval != null && currentInterval.isBefore(readLoc) ); + } + + return 0; + } + + private int getIndelSize(SAMRecord read) { + for ( CigarElement ce : read.getCigar().getCigarElements() ) { + if ( ce.getOperator() == CigarOperator.I ) + return 0; + if ( ce.getOperator() == CigarOperator.D ) + return ce.getLength(); + } + logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar()); + return 0; + } + + private boolean doNotTryToClean(SAMRecord read) { + return read.getReadUnmappedFlag() || + read.getNotPrimaryAlignmentFlag() || + read.getReadFailsVendorQualityCheckFlag() || + read.getMappingQuality() == 0 || + read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || + (BadMateFilter.hasBadMate(read)); + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + System.out.println(updatedIntervals + " intervals were updated"); + System.out.println(updatedReads + " reads were updated"); + System.out.println(affectedBases + " bases were affected"); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java new file mode 100755 index 000000000..feb5f62af --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.List; +import java.io.PrintStream; + +/** + * Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but + * very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to. + * This was its very first use. + */ +public class CountIntervals extends RefWalker { + @Output + PrintStream out; + + @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false) + int numOverlaps = 2; + + public Long reduceInit() { + return 0l; + } + + public boolean isReduceByInterval() { return true; } + + public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) { + return null; + } + + List checkIntervals = tracker.getGATKFeatureMetaData("check",false); + return (long) checkIntervals.size(); + } + + public Long reduce(Long loc, Long prev) { + if ( loc == null ) { + return 0l; + } else { + return Math.max(prev,loc); + } + } + + public void onTraversalDone(List> finalReduce) { + long count = 0; + for ( Pair g : finalReduce ) { + if ( g.second >= numOverlaps) { + count ++; + } + } + out.printf("Number of contiguous intervals: %d",count); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java deleted file mode 100755 index 67879a442..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.SimpleTimer; - -import java.io.PrintStream; -import java.io.File; -import java.io.FileInputStream; -import java.util.*; - -/** - * Emits specific fields as dictated by the user from one or more VCF files. - */ -@Requires(value={}) -public class ProfileRodSystem extends RodWalker { - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false) - int nIterations = 1; - - @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false) - int MAX_RECORDS = -1; - - SimpleTimer timer = new SimpleTimer("myTimer"); - - public void initialize() { - File rodFile = getRodFile(); - - out.printf("# walltime is in seconds%n"); - out.printf("# file is %s%n", rodFile); - out.printf("# file size is %d bytes%n", rodFile.length()); - out.printf("operation\titeration\twalltime%n"); - for ( int i = 0; i < nIterations; i++ ) { - out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE)); - out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE)); - out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS)); - out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC)); - out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE)); - } - - timer.start(); // start up timer for map itself - } - - private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE }; - - private final double readFile(File f, ReadMode mode) { - timer.start(); - - try { - byte[] data = new byte[100000]; - FileInputStream s = new FileInputStream(f); - - if ( mode == ReadMode.BY_BYTE ) { - while (true) { - if ( s.read(data) == -1 ) - break; - } - } else { - int counter = 0; - VCFCodec codec = new VCFCodec(); - String[] parts = new String[100000]; - AsciiLineReader lineReader = new AsciiLineReader(s); - - if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE ) - codec.readHeader(lineReader); - - while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) { - String line = lineReader.readLine(); - if ( line == null ) - break; - else if ( mode == ReadMode.BY_PARTS ) { - ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR); - } - else if ( mode == ReadMode.DECODE_LOC ) { - codec.decodeLoc(line); - } - else if ( mode == ReadMode.DECODE ) { - processOneVC((VariantContext)codec.decode(line)); - } - } - } - } catch ( Exception e ) { - throw new RuntimeException(e); - } - - return timer.getElapsedTime(); - } - - private File getRodFile() { - List rods = this.getToolkit().getRodDataSources(); - ReferenceOrderedDataSource rod = rods.get(0); - return rod.getFile(); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation()); - processOneVC(vc); - - return 0; - } - - private static final void processOneVC(VariantContext vc) { - vc.getNSamples(); // force us to parse the samples - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) { - out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime()); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java new file mode 100644 index 000000000..9cb715507 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java @@ -0,0 +1,153 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.math.BigInteger; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Collection; +import java.util.List; + +/** + * a walker for validating (in the style of validating pile-up) the ROD system. + */ +@Reference(window=@Window(start=-40,stop=40)) +public class RodSystemValidationWalker extends RodWalker { + + // the divider to use in some of the text output + private static final String DIVIDER = ","; + + @Output + public PrintStream out; + + @Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts") + public boolean allRecordsVariantContextEquivalent = false; + + // used to calculate the MD5 of a file + MessageDigest digest = null; + + // we sometimes need to know what rods the engine's seen + List rodList; + + /** + * emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change + * underneath us). + */ + public void initialize() { + // setup the MD5-er + try { + digest = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new ReviewedStingException("Unable to find MD5 checksumer"); + } + out.println("Header:"); + // enumerate the list of ROD's we've loaded + rodList = this.getToolkit().getRodDataSources(); + for (ReferenceOrderedDataSource rod : rodList) { + out.println(rod.getName() + DIVIDER + rod.getType()); + out.println(rod.getName() + DIVIDER + rod.getFile()); + out.println(rod.getName() + DIVIDER + md5sum(rod.getFile())); + } + out.println("Data:"); + } + + /** + * + * @param tracker the ref meta data tracker to get RODs + * @param ref reference context + * @param context the reads + * @return an 1 for each site with a rod(s), 0 otherwise + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + int ret = 0; + if (tracker != null && tracker.getAllRods().size() > 0) { + out.print(context.getLocation() + DIVIDER); + Collection features = tracker.getAllRods(); + for (GATKFeature feat : features) + out.print(feat.getName() + DIVIDER); + out.println(";"); + ret++; + } + + // if the argument was set, check for equivalence + if (allRecordsVariantContextEquivalent && tracker != null) { + Collection col = tracker.getAllVariantContexts(ref); + VariantContext con = null; + for (VariantContext contextInList : col) + if (con == null) con = contextInList; + else if (!con.equals(col)) out.println("FAIL: context " + col + " doesn't match " + con); + } + return ret; + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + @Override + public Integer reduceInit() { + return 0; + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + @Override + public void onTraversalDone(Integer result) { + // Double check traversal result to make count is the same. + // TODO: Is this check necessary? + out.println("[REDUCE RESULT] Traversal result is: " + result); + } + + // shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html + private String md5sum(File f) { + InputStream is; + try { + is = new FileInputStream(f); + } catch (FileNotFoundException e) { + return "Not a file"; + } + byte[] buffer = new byte[8192]; + int read = 0; + try { + while ((read = is.read(buffer)) > 0) { + digest.update(buffer, 0, read); + } + byte[] md5sum = digest.digest(); + BigInteger bigInt = new BigInteger(1, md5sum); + return bigInt.toString(16); + } + catch (IOException e) { + throw new RuntimeException("Unable to process file for MD5", e); + } + finally { + try { + is.close(); + } + catch (IOException e) { + throw new RuntimeException("Unable to close input stream for MD5 calculation", e); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java deleted file mode 100755 index eda01bdb4..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java +++ /dev/null @@ -1,298 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.PrintStream; - -/** - * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. - * Can also count the number of reads matching a given criterion using read filters (see the - * --read-filter command line argument). Simplest example of a read-backed analysis. - */ -@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER) -@Reference(window=@Window(start=-5,stop=5)) -@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) -public class ValidateBAQWalker extends ReadWalker { - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(doc="maximum read length to apply the BAQ calculation too",required=false) - protected int maxReadLen = 1000; - - @Argument(doc="",required=false) - protected int bw = 7; - - @Argument(doc="",required=false) - protected boolean samtoolsMode = false; - - @Argument(doc="only operates on reads with this name",required=false) - protected String readName = null; - - @Argument(doc="If true, all differences are errors", required=false) - protected boolean strict = false; - - @Argument(doc="prints info for each read", required=false) - protected boolean printEachRead = false; - - @Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false) - protected boolean alsoPrintWarnings = false; - - @Argument(doc="Include reads without BAQ tag", required=false) - protected boolean includeReadsWithoutBAQTag = false; - - @Argument(doc="x each read is processed", required=false) - protected int magnification = 1; - - @Argument(doc="Profile performance", required=false) - protected boolean profile = false; - - int counter = 0; - - BAQ baqHMM = null; // matches current samtools parameters - - public void initialize() { - if ( samtoolsMode ) - baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true); - else - baqHMM = new BAQ(); - } - - long goodReads = 0, badReads = 0; - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - - if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) { - if ( baqHMM.excludeReadFromBAQ(read) ) - return 0; - - if ( profile ) { - profileBAQ(ref, read); - } else { - validateBAQ(ref, read); - } - - return 1; - } - - return 0; - } - - SimpleTimer tagTimer = new SimpleTimer("from.tag"); - SimpleTimer baqReadTimer = new SimpleTimer("baq.read"); - SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal"); - - private void profileBAQ(ReferenceContext ref, SAMRecord read) { - IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference(); - BAQ.BAQCalculationResult baq = null; - - tagTimer.restart(); - for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); } - tagTimer.stop(); - - baqReadTimer.restart(); - for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); } - baqReadTimer.stop(); - - glocalTimer.restart(); - for ( int i = 0; i < magnification; i++ ) - baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY); - glocalTimer.stop(); - } - - - private void validateBAQ(ReferenceContext ref, SAMRecord read) { - IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference(); - byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); - if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter); - - BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader); - - boolean fail = false; - boolean print = false; - int badi = 0; - - if ( BAQ.hasBAQTag(read) ) { - for ( badi = 0; badi < baqFromTag.length; badi++ ) { - if ( baqFromTag[badi] != baq.bq[badi] ) { - if ( cigarLength(read) != read.getReadLength() ) { - print = true; - fail = false; - out.printf(" different, but cigar length != read length%n"); - break; - } - if (MathUtils.arrayMin(read.getBaseQualities()) == 0) { - print = true; - fail = strict; - out.printf(" different, but Q0 base detected%n"); - break; - } - else if (readHasSoftClip(read) && ! samtoolsMode) { - print = true; - fail = strict; - out.printf(" different, but soft clip detected%n"); - break; - } else if (readHasDeletion(read) ) { // && ! samtoolsMode) { - print = true; - fail = strict; - out.printf(" different, but deletion detected%n"); - break; - } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) { - print = fail = true; - out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual()); - break; - } else { - print = fail = true; - break; - } - } - } - if ( fail || print ) - badReads++; - else - goodReads++; - } - - if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) { - byte[] pos = new byte[baq.bq.length]; - for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i; - - out.printf(" read length : %d%n", read.getReadLength()); - out.printf(" read start : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart()); - out.printf(" cigar : %s%n", read.getCigarString()); - out.printf(" ref bases : %s%n", new String(baq.refBases)); - out.printf(" read bases : %s%n", new String(read.getReadBases())); - out.printf(" ref length : %d%n", baq.refBases.length); - out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG)); - if ( BAQ.hasBAQTag(read) ) printQuals(" BQ deltas : ", getBAQDeltas(read), true); - printQuals(" original quals: ", read.getBaseQualities(), true); - printQuals(" baq quals: ", baq.bq, true); - printQuals(" positions : ", pos, true); - printQuals(" original quals: ", read.getBaseQualities()); - if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag); - printQuals(" hmm quals: ", baq.bq); - out.printf(" read bases : %s%n", new String(read.getReadBases())); - out.println(Utils.dupString('-', 80)); - } - - - if ( fail ) - throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d", - read.getReadName(), badi, baqFromTag[badi], baq.bq[badi])); - } - - private final static boolean readHasSoftClip(SAMRecord read) { - for (CigarElement e : read.getCigar().getCigarElements()) { - if ( e.getOperator() == CigarOperator.SOFT_CLIP ) - return true; - } - - return false; - } - - private final static boolean readHasDeletion(SAMRecord read) { - for (CigarElement e : read.getCigar().getCigarElements()) { - if ( e.getOperator() == CigarOperator.DELETION ) - return true; - } - - return false; - } - - public final void printQuals( String prefix, byte[] quals ) { - printQuals(prefix, quals, false); - } - - public final void printQuals( String prefix, byte[] quals, boolean asInt ) { - printQuals(out, prefix, quals, asInt); - } - - public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) { - out.print(prefix); - for ( int i = 0; i < quals.length; i++) { - if ( asInt ) { - out.printf("%2d", (int)quals[i]); - if ( i+1 != quals.length ) out.print(","); - } else - out.print((char)(quals[i]+33)); - } - out.println(); - } - - /** - * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present. - * @param read - * @return - */ - public static byte[] getBAQDeltas(SAMRecord read) { - byte[] baq = BAQ.getBAQTag(read); - if ( baq != null ) { - byte[] deltas = new byte[baq.length]; - for ( int i = 0; i < deltas.length; i++) - deltas[i] = (byte)(-1 * (baq[i] - 64)); - return deltas; - } else - return null; - } - - private int cigarLength(SAMRecord read) { - int readI = 0; - for ( CigarElement elt : read.getCigar().getCigarElements() ) { - int l = elt.getLength(); - switch (elt.getOperator()) { - case N: // cannot handle these - return 0; - case H : case P : // ignore pads and hard clips - break; - case S : - case I : - readI += l; - break; - case D : break; - case M : - readI += l; - break; - default: - throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); - } - } - return readI; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - public void onTraversalDone(Integer nreads) { - if ( profile ) { - out.printf("n.reads baq.per.read calculation time.in.secs%n"); - printTimer(nreads, tagTimer); - printTimer(nreads, glocalTimer); - printTimer(nreads, baqReadTimer); - } else { - out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads, - goodReads, (100.0 * goodReads) / nreads, - badReads, (100.0 * badReads) / nreads); - } - } - - private final void printTimer(int nreads, SimpleTimer timer) { - out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime()); - } -} - diff --git a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java index 08be4f742..f4c3163cf 100644 --- a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java @@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.BeforeMethod; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.Utils; import java.io.File; import java.io.FileNotFoundException; -import java.util.Arrays; +import java.io.PrintStream; import java.util.List; import java.util.ArrayList; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.*; /** @@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest { System.out.println(Utils.dupString('-', 40)); System.out.println("reads : " + new String(test.readBases)); - ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false); - ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false); + printQuals(System.out, "in-quals:", test.quals, false); + printQuals(System.out, "bq-quals:", result.bq, false); for (int i = 0; i < test.quals.length; i++) { //result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i); Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality"); @@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest { } } + + public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) { + out.print(prefix); + for ( int i = 0; i < quals.length; i++) { + if ( asInt ) { + out.printf("%2d", (int)quals[i]); + if ( i+1 != quals.length ) out.print(","); + } else + out.print((char)(quals[i]+33)); + } + out.println(); + } } \ No newline at end of file From 64048a67e890be2c479a597daa6d48f6890af671 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 15:20:43 -0400 Subject: [PATCH 13/31] cleaning up ghost scala scripts. Deleting clearly unused one and moving others to qscripts.archive --- .../scala/BaseTransitionTableCalculator.scala | 211 ------------------ .../scala/IntervalAnnotationWalker.scala | 120 ---------- .../sting/scala/ScalaCountLoci.scala | 24 -- 3 files changed, 355 deletions(-) delete mode 100755 public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala delete mode 100755 public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala delete mode 100755 public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala diff --git a/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala b/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala deleted file mode 100755 index 7fe249beb..000000000 --- a/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala +++ /dev/null @@ -1,211 +0,0 @@ -package org.broadinstitute.sting.scala -/** -import gatk.walkers.genotyper.{UnifiedGenotyper, GenotypeCall} -import java.io.File -import net.sf.samtools.SAMRecord -import org.broadinstitute.sting.gatk.walkers.LocusWalker -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker -import org.broadinstitute.sting.gatk.contexts.ReferenceContext -import org.broadinstitute.sting.gatk.contexts.AlignmentContext -import org.broadinstitute.sting.utils.Pair -import org.broadinstitute.sting.utils.genotype.GenotypeMetaData -import utils._ -import cmdLine.Argument - -class TransitionTable() { - var table: Array[Array[Double]] = new Array[Array[Double]](4,4) - - def makeKey(b: Char): Int = { - return BaseUtils.simpleBaseToBaseIndex(b) - } - - def fromKey(i: Int): Char = { - return BaseUtils.baseIndexToSimpleBase(i) - } - - def add(ref: Char, subst: Char) = { - table(makeKey(ref))(makeKey(subst)) += 1 - } - - def get(i: Int, j: Int): Double = { - return table(i)(j) - } - - def set(i: Int, j: Int, d: Double) = { - //printf("Setting %d / %d => %f%n", i, j, d) - table(i)(j) = d - } - - def mapEntries[A](f: ((Int, Int, Double) => A)): List[A] = { - return for { i <- List.range(0, 4); j <- List.range(0,4) } yield f(i, j, get(i,j)) - } - - def header(): String = { - //return Utils.join("\t", mapEntries((i,j,x) => "P(%c|true=%c)" format (fromKey(j), fromKey(i))).toArray) - return Utils.join("\t", mapEntries((i,j,x) => "[%c|%c]" format (fromKey(j), fromKey(i))).toArray) - } - - override def toString(): String = { - return Utils.join("\t", mapEntries((i,j,x) => "%.2f" format x).toArray) - } - - def sum(): Double = { - return sumList(table.toList map (x => sumList(x.toList))) - } - - def sumList(xs: List[Double]): Double = { - return (0.0 :: xs) reduceLeft {(x, y) => x + y} - } - - def getRateMatrix(): TransitionTable = { - var sums = (table.toList map (x => sumList(x.toList))).toArray - var t = new TransitionTable() - - //println(sums.toList) - this mapEntries ((i,j,x) => t.set(i, j, x / sums(i))) - - return t - } -} - -**/class BaseTransitionTableCalculator /**{ // extends LocusWalker[Unit,Int] { - private var MIN_MAPPING_QUALITY = 30 - private var MIN_BASE_QUALITY = 20 - private var MIN_LOD = 5 - private var PRINT_FREQ = 1000000 - private var CARE_ABOUT_STRAND = true - - private val SSG = new UnifiedGenotyper() - private val table1 = new TransitionTable() - private val tableFWD = new TransitionTable() - private val tableREV = new TransitionTable() - private val table2 = new TransitionTable() - - @Argument() { val shortName = "v", val doc = "Print out verbose output", val required = false } - var VERBOSE: Boolean = false - - override def initialize() { - SSG.initialize(); - } - - override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Unit = { - val callPair = SSG.map(tracker, ref, context) - //val call = callPair.getFirst().get(0) - val pileup = new ReadBackedPileup(ref.getBase(), context) - - def hasNoNs(): Boolean = { - return ! (pileup.getBases() exists ('N' ==)) - } - - if ( isDefinitelyHomRef(callPair) && hasNoNs() ) { - var (refBases, nonRefBases) = splitBases(ref.getBase, pileup.getBases) - - //printf("nonRefBases %s%n", nonRefBases) - if ( nonRefBases.length > 0 ) { - var (nonRefReads, offsets) = getNonRefReads(ref.getBase, pileup) - var nonRefRead: SAMRecord = nonRefReads.head - - nonRefBases.length match { - case 1 => - //if ( goodRead(nonRefRead,offsets.head) ) { - //printf("Including site:%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head) - //} - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table1) - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, if ( nonRefRead.getReadNegativeStrandFlag ) tableREV else tableFWD ) - case 2 => - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table2) - addRead(nonRefReads.tail.head, offsets.tail.head, nonRefBases.tail.head, ref, table2) - case x => - } - - if ( VERBOSE && pileup.size > 30 && nonRefReads.length < 4 ) - printNonRefBases(ref, nonRefReads, offsets, context.getReads.size()) - } - } - } - - implicit def listToJavaList[T](l: List[T]) = java.util.Arrays.asList(l.toArray: _*) - - def printNonRefBases(ref: ReferenceContext, nonRefReads: List[SAMRecord], offsets: List[Integer], depth: Integer): Unit = { - out.println(new ReadBackedPileup(ref.getLocus(), ref.getBase(), listToJavaList(nonRefReads), offsets) + " " + depth) - } - - def addRead(nonRefRead: SAMRecord, offset: Integer, nonRefBase: Char, ref: ReferenceContext, table: TransitionTable): Unit = { - if ( goodRead(nonRefRead, offset) ) { - //printf("Including site:%n call=%s%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head) - //println(call.getReadDepth, call.getReferencebase, nonRefBases, refBases, nonRefRead.getMappingQuality) - if ( CARE_ABOUT_STRAND && nonRefRead.getReadNegativeStrandFlag() ) { - // it's on the reverse strand - val refComp: Char = BaseUtils.simpleComplement(ref.getBase) - val nonRefComp: Char = BaseUtils.simpleComplement(nonRefBase) - - //printf("Negative: %c/%c is actually %c/%c%n", ref.getBase, nonRefBases.head, refComp, nonRefComp) - table.add(refComp, nonRefComp) - } else { - table.add(ref.getBase, nonRefBase) - } - } - } - - def getNonRefReads(ref: Char, pileup: ReadBackedPileup): (List[SAMRecord], List[Integer]) = { - def getBase(i: Int): Char = { - var offset = pileup.getOffsets().get(i) - return pileup.getReads().get(i).getReadString().charAt(offset.asInstanceOf[Int]) - } - - var subset = for { i <- List.range(0, pileup.size) if getBase(i) != ref } yield i - var reads = subset map (i => pileup.getReads().get(i)) - var offsets = subset map (i => pileup.getOffsets().get(i)) - - //println(reads, offsets, subset) - return (reads, offsets) - } - - - def hasFewNonRefBases(refBase: Char, pileup: String): List[Char] = { - splitBases(refBase, pileup) match { - case (refs, nonRefs) => - //printf("nrf=%s, rb=%s%n", nonRefs.mkString, refs.mkString) - return nonRefs - } - } - - def goodRead( read: SAMRecord, offset: Integer ): Boolean = { - return read.getMappingQuality() > MIN_MAPPING_QUALITY && read.getBaseQualities()(offset.intValue) > MIN_BASE_QUALITY - } - - def splitBases(refBase: Char, pileup: String): (List[Char], List[Char]) = { - val refBases = pileup.toList filter (refBase ==) - val nonRefBases = pileup.toList filter (refBase !=) - return (refBases, nonRefBases) - } - - def isDefinitelyHomRef(call: Pair[java.util.List[GenotypeCall],GenotypeMetaData]): Boolean = { - if ( call == null ) - return false; - - return (! call.getFirst().get(0).isVariant()) && call.getFirst().get(0).getNegLog10PError() > MIN_LOD - } - - def reduceInit(): Int = { - return 0; - } - - def reduce(m: Unit, r: Int): Int = { - return r; - } - - override def onTraversalDone(r: Int) = { - out.println("-------------------- FINAL RESULT --------------------") - out.println("type\t" + table1.header) - def print1(t: String, x: TransitionTable) = { - out.println(t + "\t" + x) - } - - print1("1-mismatch", table1) - print1("2-mismatch", table2) - print1("1-mismatch-fwd", tableFWD) - print1("1-mismatch-rev", tableREV) - } -} -*/ \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala b/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala deleted file mode 100755 index fa30fceff..000000000 --- a/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala +++ /dev/null @@ -1,120 +0,0 @@ -package org.broadinstitute.sting.scala - -import java.io.PrintStream -import org.broadinstitute.sting.gatk.contexts.{ReferenceContext, AlignmentContext} -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker -import collection.JavaConversions._ -import collection.immutable.List -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature -import org.broadinstitute.sting.gatk.walkers.{TreeReducible, RefWalker} -import org.broadinstitute.sting.commandline.{Output, Argument} -import org.broadinstitute.sting.utils.{BaseUtils, GenomeLoc} -import collection.mutable.{ListBuffer, HashSet} -import java.lang.Math - -class IntervalAnnotationWalker extends RefWalker[AnnotationMetaData,List[IntervalInfoBuilder]] { - @Argument(doc="Min proportion of bases overlapping between an interval of interest and an annotation interval for annotation to occur",shortName="mpb") - var minPropOverlap : Double = 0.50 // default to 50% - @Output var out : PrintStream = _ - - override def reduceInit : List[IntervalInfoBuilder] = { - Nil - } - - override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext) : AnnotationMetaData = { - val ivalsOfInterest : List[GATKFeature] = tracker.getGATKFeatureMetaData("interval_list",false).toList - val refSeqData : List[Object] = tracker.getReferenceMetaData("refseq").toList - val tableMetaData : List[Object] = tracker.getReferenceMetaData("table",false).toList - var amd : AnnotationMetaData = new AnnotationMetaData(ref.getLocus,ref.getBase) - if ( refSeqData != null ) { - amd.refSeqFeatures = refSeqData.map( (o: Object) => o.asInstanceOf[RefSeqFeature]) - } - if ( tableMetaData != null ) { - amd.tableFeatures = tableMetaData.map( (o: Object) => o.asInstanceOf[TableFeature]).map( (u: TableFeature) => - new Tuple2(u.getLocation,u.getAllValues.toList.reduceLeft(_ + "," + _))) - } - amd.newIvals = ivalsOfInterest.map(_.getLocation) - return amd - } - - override def reduce(mapMetaData : AnnotationMetaData, rList : List[IntervalInfoBuilder]) : List[IntervalInfoBuilder] = { - rList.filter( (u : IntervalInfoBuilder) => u.location.isBefore(mapMetaData.locus)).foreach(u => out.print("%s%n".format(u.done))) - val newList : List[IntervalInfoBuilder] = rList.filter( ! _.finalized ) ::: - mapMetaData.newIvals.filter( (g: GenomeLoc) => g.getStart == mapMetaData.locus.getStart ).map( u => new IntervalInfoBuilder(u,minPropOverlap) ) - newList.foreach(_.update(mapMetaData)) - return newList - } - - override def onTraversalDone(finalList : List[IntervalInfoBuilder]) = { - finalList.foreach(u => out.print("%s%n".format(u.done))) - } - -} - -class AnnotationMetaData(loc: GenomeLoc, ref: Byte) { - val locus : GenomeLoc = loc - var newIvals : List[GenomeLoc] = Nil - var refSeqFeatures : List[RefSeqFeature] = Nil - var tableFeatures : List[(GenomeLoc,String)] = Nil - val refBase : Byte = ref -} - -class IntervalInfoBuilder(loc : GenomeLoc, minProp : Double) { - - val OVERLAP_PROP : Double = minProp - - var metaData : HashSet[String] = new HashSet[String] - var seenTranscripts : HashSet[String] = new HashSet[String] - var baseContent : ListBuffer[Byte] = new ListBuffer[Byte] - var location : GenomeLoc = loc - var gcContent : Double = _ - var entropy : Double = _ - var finalized : Boolean = false - - - def update(mmd : AnnotationMetaData) = { - baseContent += mmd.refBase - mmd.refSeqFeatures.filter( p => ! seenTranscripts.contains(p.getTranscriptUniqueGeneName)).view.foreach(u => addRefSeq(u)) - mmd.tableFeatures.filter( (t : (GenomeLoc,String)) => - ! metaData.contains(t._2) && (t._1.intersect(location).size.asInstanceOf[Double]/location.size() >= OVERLAP_PROP) ).foreach( t => metaData.add(t._2)) - } - - def addRefSeq( rs: RefSeqFeature ) : Unit = { - seenTranscripts.add(rs.getTranscriptUniqueGeneName) - val exons : List[(GenomeLoc,Int)] = rs.getExons.toList.zipWithIndex.filter( p => ((p._1.intersect(location).size+0.0)/location.size) >= minProp ) - if ( exons.size > 0 ) { - metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,exons.map( p => "exon%d".format(p._2) ).reduceLeft(_ + "," + _))) - } else { - if ( location.isBefore(rs.getExons.get(0)) || location.isPast(rs.getExons.get(rs.getNumExons-1)) ) { - metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"UTR")) - } else { - metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"Intron")) - } - } - - } - - def done : String = { - finalized = true - def isGC(b : Byte) : Int = if ( BaseUtils.gIndex == b || BaseUtils.cIndex == b ) { 1 } else { 0 } - gcContent = baseContent.foldLeft[Int](0)( (a,b) => a + isGC(b)).asInstanceOf[Double]/location.size() - entropy = calcEntropy(baseContent.map(b => ListBuffer(b))) + calcEntropy(baseContent.reverse.map(b => ListBuffer(b))) - val meta : String = metaData.reduceLeft(_ + "\t" + _) - return "%s\t%d\t%d\t%.2f\t%.2f\t%s".format(location.getContig,location.getStart,location.getStop,gcContent,entropy,meta) - } - - def calcEntropy(byteList : ListBuffer[ListBuffer[Byte]]) : Double = { - if(byteList.size == 1) return 0 - Math.log(1+byteList.tail.size-byteList.tail.dropWhile( u => u.equals(byteList(1))).size) + - calcEntropy(byteList.tail.foldLeft(ListBuffer(byteList(0)))( (a,b) => { - if ( b.equals(byteList(1)) ) { - a.dropRight(1) :+ (a.last ++ b) - } else { - a :+ b - } - })) - } - -} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala b/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala deleted file mode 100755 index 658aabc18..000000000 --- a/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala +++ /dev/null @@ -1,24 +0,0 @@ -package org.broadinstitute.sting.scala - -import org.broadinstitute.sting.gatk.walkers.LocusWalker -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker -import org.broadinstitute.sting.gatk.contexts.ReferenceContext -import org.broadinstitute.sting.gatk.contexts.AlignmentContext - -class ScalaCountLoci extends LocusWalker[Int,Int] { - override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Int = { - return 1 - } - - def reduceInit(): Int = { - return 0; - } - - def reduce(m: Int, r: Int): Int = { - return m + r; - } - - def main(args: Array[String]) { - println("Hello, world!") - } -} From 761347b8d5b71e54bc1db01ec686efe323602c67 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Jun 2011 15:26:09 -0400 Subject: [PATCH 14/31] The VariantContext utility method used by SelectVariants wasn't checking the filter status (unfiltered vs. passing filters) and always returned a VC that was passing filters. This is fixed and the md5 from the VCF Streaming test has been re-updated. --- .../sting/utils/variantcontext/VariantContext.java | 2 +- .../walkers/variantutils/VCFStreamingIntegrationTest.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 92262f1bf..3d375aba2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -414,7 +414,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return vc subcontext */ public VariantContext subContextFromGenotypes(Collection genotypes, Set alleles) { - return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), getFilters(), getAttributes()); + return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes()); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 4ed9718fd..cf0673ee6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " --NO_HEADER" + " -o %s", 1, - Arrays.asList("995c07ccd593fe1c35d0d28155112a55") + Arrays.asList("debbbf3e661b6857cc8d99ff7635bb1d") ); executeTest("testSimpleVCFStreaming", spec); @@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " -EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("f60729c900bc8368717653b3fad80d1e") + Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e" ); executeTest("testVCFStreamingChain", selectTestSpec); From 4b17f346409ff56c2192f60d311b03d69f5f6b89 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 30 Jun 2011 18:27:17 -0400 Subject: [PATCH 16/31] Fixes to the packaging system to work with the new directory structure. Also added a few R scripts to the list of files to include in the resources directory of the binary distribution, at Ryan's request. --- build.xml | 6 +++--- public/packages/AnalyzeCovariates.xml | 4 ++-- public/packages/GATKEngine.xml | 10 +++++----- public/packages/GenomeAnalysisTK.xml | 15 +++++++++------ 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/build.xml b/build.xml index cf8623bed..fe1723587 100644 --- a/build.xml +++ b/build.xml @@ -64,7 +64,7 @@ - + @@ -896,7 +896,7 @@ - + @@ -934,7 +934,7 @@ - + diff --git a/public/packages/AnalyzeCovariates.xml b/public/packages/AnalyzeCovariates.xml index 3a3cae54e..1862d6cbb 100644 --- a/public/packages/AnalyzeCovariates.xml +++ b/public/packages/AnalyzeCovariates.xml @@ -15,7 +15,7 @@ - - + + diff --git a/public/packages/GATKEngine.xml b/public/packages/GATKEngine.xml index 1f35522d9..4f635f7fb 100644 --- a/public/packages/GATKEngine.xml +++ b/public/packages/GATKEngine.xml @@ -40,10 +40,10 @@ - - - - - + + + + + diff --git a/public/packages/GenomeAnalysisTK.xml b/public/packages/GenomeAnalysisTK.xml index 2aa58952e..f923ea698 100644 --- a/public/packages/GenomeAnalysisTK.xml +++ b/public/packages/GenomeAnalysisTK.xml @@ -13,12 +13,15 @@ - - - - - - + + + + + + + + + From 9644f104c49ed50c8894722ea971d68e274b74ca Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 1 Jul 2011 13:13:24 -0400 Subject: [PATCH 18/31] Fixes to the queue pipeline tests to account for the new directory structure. --- .../pipeline/examples/ExampleCountLociPipelineTest.scala | 2 +- .../queue/pipeline/examples/HelloWorldPipelineTest.scala | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala index 2b3b51783..09f80f2c3 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala @@ -35,7 +35,7 @@ class ExampleCountLociPipelineTest { val spec = new PipelineTestSpec spec.name = "countloci" spec.args = Array( - " -S scala/qscript/examples/ExampleCountLoci.scala", + " -S public/scala/qscript/examples/ExampleCountLoci.scala", " -R " + BaseTest.hg18Reference, " -I " + BaseTest.validationDataLocation + "small_bam_for_countloci.bam", " -o " + testOut).mkString diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala index 8b8fa86a4..37531e791 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala @@ -32,7 +32,7 @@ class HelloWorldPipelineTest { def testHelloWorld { val spec = new PipelineTestSpec spec.name = "HelloWorld" - spec.args = "-S scala/qscript/examples/HelloWorld.scala" + spec.args = "-S public/scala/qscript/examples/HelloWorld.scala" PipelineTest.executeTest(spec) } @@ -40,7 +40,7 @@ class HelloWorldPipelineTest { def testHelloWorldWithPrefix { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPrefix" - spec.args = "-S scala/qscript/examples/HelloWorld.scala -jobPrefix HelloWorld" + spec.args = "-S public/scala/qscript/examples/HelloWorld.scala -jobPrefix HelloWorld" PipelineTest.executeTest(spec) } @@ -48,7 +48,7 @@ class HelloWorldPipelineTest { def testHelloWorldWithPriority { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPriority" - spec.args = "-S scala/qscript/examples/HelloWorld.scala -jobPriority 100" + spec.args = "-S public/scala/qscript/examples/HelloWorld.scala -jobPriority 100" PipelineTest.executeTest(spec) } } From 11d4af0e7528d0d3dd8b5da3745ad7d7bced7280 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 1 Jul 2011 13:41:34 -0400 Subject: [PATCH 19/31] Path-related fixes to the private queue pipeline tests. --- .../pipeline/IPFLibraryPipelineTest.scala | 71 ------------------- 1 file changed, 71 deletions(-) delete mode 100755 public/scala/test/org/broadinstitute/sting/queue/pipeline/IPFLibraryPipelineTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/IPFLibraryPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/IPFLibraryPipelineTest.scala deleted file mode 100755 index 6b32e5c9d..000000000 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/IPFLibraryPipelineTest.scala +++ /dev/null @@ -1,71 +0,0 @@ -package org.broadinstitute.sting.queue.pipeline - -import org.testng.annotations.Test -import org.broadinstitute.sting.BaseTest - -class IPFLibraryPipelineTest { - - @Test - def testVCFExtractSites { - var testOut = "vcfes.vcf" - val spec = new PipelineTestSpec - spec.name = "vcfExtractSites" - spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFExtractSites -ivcf %s -out %s".format( - BaseTest.validationDataLocation + "omni_1212.subset.b37.vcf", testOut - ) - spec.fileMD5s += testOut -> "4f496b8cf90302428a9edda486a337f4" - PipelineTest.executeTest(spec) - } - - @Test - def testVCFExtractSamples { - var testOut = "vcf.extract.samples.vcf" - val spec = new PipelineTestSpec - spec.name = "vcfExtractSamples" - spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFExtractSamples -ivcf %s -out %s -sm HG00107,HG00500,NA18501,NA18942".format( - BaseTest.validationDataLocation + "omni_1212.subset.b37.vcf", testOut - ) - - spec.fileMD5s += testOut -> "180d5a2e7a1fbc5117de6705bde3a7c8" - } - - @Test - def testVCFExtractIntervals { - var testOut = "vcf.extract.intervals.list" - val spec = new PipelineTestSpec - spec.name = "vcfExtractIntervals" - spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFExtractIntervals -ivcf %s -out %s".format( - BaseTest.validationDataLocation + "omni_1212.subset.b37.vcf", testOut - ) - - spec.fileMD5s += testOut ->"28589eeb28fac753577c027abf939345" - - } - - @Test - def testVCFSimpleMerge { - var testOut = "vcf.simplemerge.vcf" - val spec = new PipelineTestSpec - val int1 = BaseTest.validationDataLocation + "omni.subset.interleaved.1.vcf" - val int2 = BaseTest.validationDataLocation + "omni.subset.interleaved.2.vcf" - spec.name = "vcfSimpleMerge" - spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFSimpleMerge -vcfs %s,%s -out %s -ref %s".format( - int1,int2,testOut,BaseTest.b37KGReference - ) - - spec.fileMD5s += testOut -> "c59b8de64ba787a2ca3a1734bdebf277" - } - - @Test - def testSortByRef { - var testOut = "vcf.refsorted.vcf" - val spec = new PipelineTestSpec - val unsorted = BaseTest.validationDataLocation + "omni.pos_sorted.vcf" - spec.name = "sortByRef" - spec.args = "-S scala/qscript/oneoffs/QTools.q -T SortByRef -ivcf %s -out %s -ref %s".format( - unsorted, testOut, BaseTest.b37KGReference - ) - - spec.fileMD5s += testOut -> "ee09af803bc94987d55d044c2ebbc0b8" - } -} From d19351f71acc60673286d5dc8b1e6516bcc7a4a3 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 1 Jul 2011 16:02:28 -0400 Subject: [PATCH 20/31] Added capability of running multiple bam files in the same directory. --- .../sting/queue/qscripts/RecalibrateBaseQualities.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index 678b356ee..51de3a7ad 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -41,9 +41,9 @@ class RecalibrateBaseQualities extends QScript { nContigs = getNumberOfContigs(input) - val recalFile1: File = new File("recal1.csv") - val recalFile2: File = new File("recal2.csv") - val recalBam: File = swapExt(input, ".bam", "recal.bam") + val recalFile1: File = swapExt(input, ".bam", "recal1.csv") + val recalFile2: File = swapExt(input, ".bam", "recal2.csv") + val recalBam: File = swapExt(input, ".bam", "recal.bam") val path1: String = "before" val path2: String = "after" @@ -88,4 +88,4 @@ class RecalibrateBaseQualities extends QScript { this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" } -} \ No newline at end of file +} From d647ea4fdc4274507d0456c955f2dfbf66ae70c1 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 1 Jul 2011 16:04:30 -0400 Subject: [PATCH 21/31] Long-delayed change to CachingIndexedFastaSequenceFile. Made the cache non-static to avoid problems when multiple references are used within the same thread (eg., during integration tests). This should kill the intermittent IndelRealignerIntegrationTest failures. --- .../utils/fasta/CachingIndexedFastaSequenceFile.java | 8 ++++---- .../fasta/CachingIndexedFastaSequenceFileUnitTest.java | 5 ----- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index bda52dd4e..0c5085cc7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -68,13 +68,13 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { ReferenceSequence seq = null; } - private static ThreadLocal cache; + private ThreadLocal cache; - static { + { resetThreadLocalCache(); - }; + } - protected static void resetThreadLocalCache() { + protected void resetThreadLocalCache() { cache = new ThreadLocal () { @Override protected Cache initialValue() { return new Cache(); diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index a60f1b0fe..74cd8d8fe 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -34,11 +34,6 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private static final List QUERY_SIZES = Arrays.asList(1, 10, 100, 1000); private static final List CACHE_SIZES = Arrays.asList(-1, 10, 1000); - @BeforeMethod - public void clearCache() { - CachingIndexedFastaSequenceFile.resetThreadLocalCache(); - } - @DataProvider(name = "fastas") public Object[][] createData1() { List params = new ArrayList(); From b0fb63e20a86e5d3542a1c5301663eb8cc884795 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 1 Jul 2011 16:14:59 -0400 Subject: [PATCH 22/31] moving the example scala scripts to the qscripts package. --- .../sting/queue/qscripts}/examples/ExampleCountLoci.scala | 2 ++ .../sting/queue/qscripts}/examples/ExampleCountReads.scala | 2 ++ .../sting/queue/qscripts}/examples/ExampleCustomWalker.scala | 2 ++ .../queue/qscripts}/examples/ExampleUnifiedGenotyper.scala | 2 ++ .../sting/queue/qscripts}/examples/HelloWorld.scala | 2 ++ .../broadinstitute/sting/queue/qscripts}/lib/Vcf2Table.q | 2 ++ 6 files changed, 12 insertions(+) rename public/scala/qscript/{ => org/broadinstitute/sting/queue/qscripts}/examples/ExampleCountLoci.scala (93%) rename public/scala/qscript/{ => org/broadinstitute/sting/queue/qscripts}/examples/ExampleCountReads.scala (97%) rename public/scala/qscript/{ => org/broadinstitute/sting/queue/qscripts}/examples/ExampleCustomWalker.scala (96%) rename public/scala/qscript/{ => org/broadinstitute/sting/queue/qscripts}/examples/ExampleUnifiedGenotyper.scala (98%) rename public/scala/qscript/{ => org/broadinstitute/sting/queue/qscripts}/examples/HelloWorld.scala (76%) rename public/scala/qscript/{ => org/broadinstitute/sting/queue/qscripts}/lib/Vcf2Table.q (95%) diff --git a/public/scala/qscript/examples/ExampleCountLoci.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala similarity index 93% rename from public/scala/qscript/examples/ExampleCountLoci.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala index ba834e72f..4ca3cbb89 100644 --- a/public/scala/qscript/examples/ExampleCountLoci.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala @@ -1,3 +1,5 @@ +package org.broadinstitute.sting.queue.qscripts.examples + import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ diff --git a/public/scala/qscript/examples/ExampleCountReads.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala similarity index 97% rename from public/scala/qscript/examples/ExampleCountReads.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala index 85f796a51..9fdd1ba4c 100644 --- a/public/scala/qscript/examples/ExampleCountReads.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala @@ -1,3 +1,5 @@ +package org.broadinstitute.sting.queue.qscripts.examples + import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ diff --git a/public/scala/qscript/examples/ExampleCustomWalker.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala similarity index 96% rename from public/scala/qscript/examples/ExampleCustomWalker.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala index 340934b5b..d3796d350 100644 --- a/public/scala/qscript/examples/ExampleCustomWalker.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala @@ -1,3 +1,5 @@ +package org.broadinstitute.sting.queue.qscripts.examples + import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ diff --git a/public/scala/qscript/examples/ExampleUnifiedGenotyper.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala similarity index 98% rename from public/scala/qscript/examples/ExampleUnifiedGenotyper.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala index 2f6940d64..4a93233eb 100644 --- a/public/scala/qscript/examples/ExampleUnifiedGenotyper.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala @@ -1,3 +1,5 @@ +package org.broadinstitute.sting.queue.qscripts.examples + import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ diff --git a/public/scala/qscript/examples/HelloWorld.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala similarity index 76% rename from public/scala/qscript/examples/HelloWorld.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala index 64e0c161c..9f2ce27fe 100644 --- a/public/scala/qscript/examples/HelloWorld.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala @@ -1,3 +1,5 @@ +package org.broadinstitute.sting.queue.qscripts.examples + import org.broadinstitute.sting.queue.QScript class HelloWorld extends QScript { diff --git a/public/scala/qscript/lib/Vcf2Table.q b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q similarity index 95% rename from public/scala/qscript/lib/Vcf2Table.q rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q index 1b66a648b..25ff8fddd 100755 --- a/public/scala/qscript/lib/Vcf2Table.q +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q @@ -1,3 +1,5 @@ +package org.broadinstitute.sting.queue.qscripts.lib + import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.queue.extensions.gatk.{RodBind, VariantsToTable} import org.broadinstitute.sting.queue.QScript From 546e7777fa1c8f6d091f75aa28e6ac490fbf3523 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 1 Jul 2011 16:39:10 -0400 Subject: [PATCH 23/31] Re-fixing paths in pipeline tests after example qscripts got moved. --- .../pipeline/examples/ExampleCountLociPipelineTest.scala | 2 +- .../queue/pipeline/examples/HelloWorldPipelineTest.scala | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala index 09f80f2c3..1e6c93cff 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala @@ -35,7 +35,7 @@ class ExampleCountLociPipelineTest { val spec = new PipelineTestSpec spec.name = "countloci" spec.args = Array( - " -S public/scala/qscript/examples/ExampleCountLoci.scala", + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala", " -R " + BaseTest.hg18Reference, " -I " + BaseTest.validationDataLocation + "small_bam_for_countloci.bam", " -o " + testOut).mkString diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala index 37531e791..0871e769b 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala @@ -32,7 +32,7 @@ class HelloWorldPipelineTest { def testHelloWorld { val spec = new PipelineTestSpec spec.name = "HelloWorld" - spec.args = "-S public/scala/qscript/examples/HelloWorld.scala" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" PipelineTest.executeTest(spec) } @@ -40,7 +40,7 @@ class HelloWorldPipelineTest { def testHelloWorldWithPrefix { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPrefix" - spec.args = "-S public/scala/qscript/examples/HelloWorld.scala -jobPrefix HelloWorld" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPrefix HelloWorld" PipelineTest.executeTest(spec) } @@ -48,7 +48,7 @@ class HelloWorldPipelineTest { def testHelloWorldWithPriority { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPriority" - spec.args = "-S public/scala/qscript/examples/HelloWorld.scala -jobPriority 100" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPriority 100" PipelineTest.executeTest(spec) } } From 444eae316c8d33e4b3ff765aa5b7211f5bd746a9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 1 Jul 2011 17:26:25 -0400 Subject: [PATCH 25/31] Moving these supported perl scripts to public --- public/perl/liftOverVCF.pl | 83 ++++++++++++++++++++++++ public/perl/sortByRef.pl | 127 +++++++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100755 public/perl/liftOverVCF.pl create mode 100755 public/perl/sortByRef.pl diff --git a/public/perl/liftOverVCF.pl b/public/perl/liftOverVCF.pl new file mode 100755 index 000000000..21cb8bb6b --- /dev/null +++ b/public/perl/liftOverVCF.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl -w + +# Runs the liftover tool on a VCF and properly handles the output + +use strict; +use Getopt::Long; + +my $in = undef; +my $gatk = undef; +my $chain = undef; +my $newRef = undef; +my $oldRef = undef; +my $out = undef; +my $tmp = "/tmp"; +my $recordOriginalLocation = 0; +GetOptions( "vcf=s" => \$in, + "gatk=s" => \$gatk, + "chain=s" => \$chain, + "newRef=s" => \$newRef, + "oldRef=s" => \$oldRef, + "out=s" => \$out, + "tmp=s" => \$tmp, + "recordOriginalLocation" => \$recordOriginalLocation); + +if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) { + print "Usage: liftOverVCF.pl\n\t-vcf \t\t\n\t-gatk \t\t\n\t-chain \t\t\n\t-newRef \t\n\t-oldRef \t\n\t-out \t\t\n\t-tmp \t\t\n\t-recordOriginalLocation \t\t\n"; + print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n"; + exit(1); +} + +# generate a random number +my $random_number = rand(); +my $tmp_prefix = "$tmp/$random_number"; +print "Writing temporary files to prefix: $tmp_prefix\n"; +my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; + +# lift over the file +print "Lifting over the vcf..."; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +if ($recordOriginalLocation) { + $cmd .= " -recordOriginalLocation"; +} +system($cmd) == 0 or quit("The liftover step failed. Please correct the necessary errors before retrying."); + +# we need to sort the lifted over file now +print "\nRe-sorting the vcf...\n"; +my $sorted_vcf = "$tmp_prefix.sorted.vcf"; +open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!"; + +# write the header +open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!"; +my $inHeader = 1; +while ( $inHeader == 1 ) { + my $line = ; + if ( $line !~ m/^#/ ) { + $inHeader = 0; + } else { + print SORTED "$line"; + } +} +close(UNSORTED); +close(SORTED); + +$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/public/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai >> $sorted_vcf"; +system($cmd) == 0 or quit("The sorting step failed. Please correct the necessary errors before retrying."); + +# Filter the VCF for bad records +print "\nFixing/removing bad records...\n"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out"; +system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying."); + +# clean up +unlink $unsorted_vcf; +unlink $sorted_vcf; +my $sorted_index = "$sorted_vcf.idx"; +unlink $sorted_index; + +print "\nDone!\n"; + +sub quit { + print "\n$_[0]\n"; + exit(1); +} diff --git a/public/perl/sortByRef.pl b/public/perl/sortByRef.pl new file mode 100755 index 000000000..71d3f4477 --- /dev/null +++ b/public/perl/sortByRef.pl @@ -0,0 +1,127 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; + +sub usage { + + print "\nUsage:\n"; + print "sortByRef.pl [--k POS] [--tmp dir] INPUT REF_DICT\n\n"; + + print " Sorts lines of the input file INFILE according\n"; + print " to the reference contig order specified by the\n"; + print " reference dictionary REF_DICT (.fai file).\n"; + print " The sort is stable. If -k option is not specified,\n"; + print " it is assumed that the contig name is the first\n"; + print " field in each line.\n\n"; + print " INPUT input file to sort. If '-' is specified, \n"; + print " then reads from STDIN.\n"; + print " REF_DICT .fai file, or ANY file that has contigs, in the\n"; + print " desired soting order, as its first column.\n"; + print " --k POS : contig name is in the field POS (1-based)\n"; + print " of input lines.\n\n"; + print " --tmp DIR : temp directory [default=/tmp]\n\n"; + + exit(1); +} + +my $pos = 1; +my $tmp = "/tmp"; +GetOptions( "k:i" => \$pos, + "tmp=s" => \$tmp); + +$pos--; + +usage() if ( scalar(@ARGV) == 0 ); + +if ( scalar(@ARGV) != 2 ) { + print "Wrong number of arguments\n"; + usage(); +} + +my $input_file = $ARGV[0]; +my $dict_file = $ARGV[1]; + + +open(DICT, "< $dict_file") or die("Can not open $dict_file: $!"); + +my %ref_order; + +my $n = 0; +while ( ) { + chomp; + my ($contig, $rest) = split "\t"; + die("Dictionary file is probably corrupt: multiple instances of contig $contig") if ( defined $ref_order{$contig} ); + + $ref_order{$contig} = $n; + $n++; +} + +close DICT; +#we have loaded contig ordering now + +my $INPUT; +if ($input_file eq "-" ) { + $INPUT = "STDIN"; +} else { + open($INPUT, "< $input_file") or die("Can not open $input_file: $!"); +} + +my %temp_outputs; + +while ( <$INPUT> ) { + + my @fields = split '\s'; + die("Specified field position exceeds the number of fields:\n$_") + if ( $pos >= scalar(@fields) ); + + my $contig = $fields[$pos]; + if ( $contig =~ m/:/ ) { + my @loc = split(/:/, $contig); + # print $contig . " " . $loc[0] . "\n"; + $contig = $loc[0] + } + chomp $contig if ( $pos == scalar(@fields) - 1 ); # if last field in line + + my $order; + if ( defined $ref_order{$contig} ) { $order = $ref_order{$contig}; } + else { + $ref_order{$contig} = $n; + $order = $n; # input line has contig that was not in the dict; + $n++; # this contig will go at the end of the output, + # after all known contigs + } + + my $fhandle; + if ( defined $temp_outputs{$order} ) { $fhandle = $temp_outputs{$order} } + else { + #print "opening $order $$ $_\n"; + open( $fhandle, " > $tmp/sortByRef.$$.$order.tmp" ) or + die ( "Can not open temporary file $order: $!"); + $temp_outputs{$order} = $fhandle; + } + + # we got the handle to the temp file that keeps all + # lines with contig $contig + + print $fhandle $_; # send current line to its corresponding temp file +} + +close $INPUT; + +foreach my $f ( values %temp_outputs ) { close $f; } + +# now collect back into single output stream: + +for ( my $i = 0 ; $i < $n ; $i++ ) { + # if we did not have any lines on contig $i, then there's + # no temp file and nothing to do + next if ( ! defined $temp_outputs{$i} ) ; + + my $f; + open ( $f, "< $tmp/sortByRef.$$.$i.tmp" ); + while ( <$f> ) { print ; } + close $f; + + unlink "$tmp/sortByRef.$$.$i.tmp"; +} From 0c9105ca221508c5205f61ee23d5fe779936c513 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 1 Jul 2011 18:07:35 -0400 Subject: [PATCH 26/31] Minor fix of description --- .../org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java index 33069f1f5..f23433bb5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java @@ -42,5 +42,5 @@ public class LowMQ implements InfoFieldAnnotation { public List getKeyNames() { return Arrays.asList("LowMQ"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 3, VCFHeaderLineType.Integer, "3-tuple: ,,")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 3, VCFHeaderLineType.Float, "3-tuple: ,,")); } } From b6bc64a0c8e4c2dd616dd5a9aad86416857d6d09 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Fri, 1 Jul 2011 20:47:03 -0400 Subject: [PATCH 27/31] Cleanup of the utils.broad package. Using Picard IoUtils on sample names. --- .../{datasources => }/pipeline/Pipeline.java | 2 +- .../pipeline/PipelineProject.java | 2 +- .../pipeline/PipelineSample.java | 2 +- .../utils/broad/PicardAggregationUtils.java | 149 ------------------ .../utils/broad/PicardAnalysisFiles.java | 108 ------------- .../sting/utils/broad/PicardPipeline.java | 123 --------------- .../sting/utils/broad/ReferenceData.java | 135 ---------------- .../pipeline/PipelineUnitTest.java | 4 +- .../broad/PicardAggregationUtilsUnitTest.java | 61 ------- .../broad/PicardAnalysisFilesUnitTest.java | 56 ------- .../utils/broad/PicardPipelineUnitTest.java | 71 --------- .../utils/broad/ReferenceDataUnitTest.java | 49 ------ public/packages/Queue.xml | 2 +- .../sting/queue/pipeline/PipelineTest.scala | 87 ---------- 14 files changed, 7 insertions(+), 844 deletions(-) rename public/java/src/org/broadinstitute/sting/{datasources => }/pipeline/Pipeline.java (97%) rename public/java/src/org/broadinstitute/sting/{datasources => }/pipeline/PipelineProject.java (98%) rename public/java/src/org/broadinstitute/sting/{datasources => }/pipeline/PipelineSample.java (97%) delete mode 100755 public/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java delete mode 100755 public/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java delete mode 100755 public/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java delete mode 100755 public/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java rename public/java/test/org/broadinstitute/sting/{datasources => }/pipeline/PipelineUnitTest.java (96%) delete mode 100755 public/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java delete mode 100755 public/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java delete mode 100755 public/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java delete mode 100755 public/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java b/public/java/src/org/broadinstitute/sting/pipeline/Pipeline.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java rename to public/java/src/org/broadinstitute/sting/pipeline/Pipeline.java index f8f8b2d29..e0e75c353 100644 --- a/public/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java +++ b/public/java/src/org/broadinstitute/sting/pipeline/Pipeline.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.datasources.pipeline; +package org.broadinstitute.sting.pipeline; import java.util.ArrayList; import java.util.List; diff --git a/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java b/public/java/src/org/broadinstitute/sting/pipeline/PipelineProject.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java rename to public/java/src/org/broadinstitute/sting/pipeline/PipelineProject.java index 7967ba5df..8d33047bf 100644 --- a/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java +++ b/public/java/src/org/broadinstitute/sting/pipeline/PipelineProject.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.datasources.pipeline; +package org.broadinstitute.sting.pipeline; import java.io.File; import java.util.Map; diff --git a/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java b/public/java/src/org/broadinstitute/sting/pipeline/PipelineSample.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java rename to public/java/src/org/broadinstitute/sting/pipeline/PipelineSample.java index 701841302..7cd25fed5 100644 --- a/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java +++ b/public/java/src/org/broadinstitute/sting/pipeline/PipelineSample.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.datasources.pipeline; +package org.broadinstitute.sting.pipeline; import java.io.File; import java.util.Map; diff --git a/public/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java b/public/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java deleted file mode 100755 index 8f3ffd741..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.broad; - -import org.apache.commons.io.filefilter.RegexFileFilter; - -import java.io.File; -import java.io.FileFilter; -import java.io.FileNotFoundException; -import java.util.Arrays; - -public class PicardAggregationUtils { - public static final String PICARD_AGGREGATION_DIR = "/seq/picard_aggregation/"; - - /** - * Returns the path to the sample BAM. - * @param project Project - * @param sample Sample - * @param version Version - * @return The path to the sample BAM. - */ - public static String getSampleBam(String project, String sample, int version) { - return getSampleDir(project, sample, version) + sample + ".bam"; - } - - /** - * Returns the path to the latest BAM. - * @param project Project - * @param sample Sample - * @return The path to the latest BAM. - * @throws FileNotFoundException If a finished directory cannot be found for a sample. - */ - public static String getSampleBam(String project, String sample) throws FileNotFoundException { - return getSampleDir(project, sample) + sample + ".bam"; - } - - /** - * Returns the sample directory. - * @param project Project - * @param sample Sample - * @param version Version - * @return the sample directory. - */ - public static String getSampleDir(String project, String sample, int version) { - return PICARD_AGGREGATION_DIR + String.format("%s/%s/v%d/", project, sample, version); - } - - /** - * Returns the latest finished directory for this project sample. - * @param project Project - * @param sample Sample - * @return The path to the latest finished directory. - * @throws FileNotFoundException If a finished directory cannot be found for a sample. - */ - public static String getSampleDir(String project, String sample) throws FileNotFoundException { - int latestVersion = getLatestVersion(project, sample); - if (latestVersion == 0) - throw new FileNotFoundException("Unable to find a finished directory for project sample " + project + "/" + sample); - return getSampleDir(project, sample, latestVersion); - } - - /** - * Returns the latest finished version directory. - * Because isilon metadata operations are relatively slow this method - * tries not to look for every possible versioned directory. - * @param project Project - * @param sample Sample - * @return The highest finished version directory or 0 if a finished directory was not found. - */ - public static int getLatestVersion(String project, String sample) { - return getLatestVersion(project, sample, 0); - } - - /** - * Returns the latest finished version directory after startVersion. - * Because isilon metadata operations are relatively slow this method - * tries not to look for every possible versioned directory. - * @param project Project - * @param sample Sample - * @param startVersion minimum version to return - * @return The highest finished version directory after startVersion - */ - public static int getLatestVersion(String project, String sample, int startVersion) { - int version = Math.max(0, startVersion); - boolean nextExists = true; - while (nextExists) { - nextExists = false; - for (int next = 3; next > 0; next--) - if (isFinished(project, sample, version + next)) { - version += next; - nextExists = true; - break; - } - } - // Special case when the version is 0 - // Because isilon storage takes a while to do meta data operations only look through every file if we have to. - if (version == 0) { - File sampleDir = new File(PICARD_AGGREGATION_DIR + project + "/" + sample); - if (sampleDir.exists()) { - FileFilter filter = new RegexFileFilter("v\\d+"); - File[] files = sampleDir.listFiles(filter); - int[] versions = new int[files.length]; - for (int i = 0; i < files.length; i++) - versions[i] = Integer.parseInt(files[i].getName().substring(1)); - Arrays.sort(versions); - for (int i = versions.length - 1; i >= 0; i--) { - if (isFinished(project, sample, versions[i])) { - version = versions[i]; - break; - } - } - } - } - return version == 0 ? startVersion : version; - } - - /** - * Returns true if the project sample directory contains a finished.txt - * @param project Project - * @param sample Sample - * @param version Version - * @return true if the project sample directory contains a finished.txt - */ - public static boolean isFinished(String project, String sample, int version) { - return new File(getSampleDir(project, sample, version), "finished.txt").exists(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java b/public/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java deleted file mode 100755 index 02381916e..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.broad; - -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -public class PicardAnalysisFiles { - private static final String REFERENCE_SEQUENCE_HEADER = "REFERENCE_SEQUENCE"; - private static final String TARGET_INTERVALS_HEADER = "TARGET_INTERVALS"; - private static final String BAIT_INTERVALS_HEADER = "BAIT_INTERVALS"; - private static final String[] ANALYSIS_HEADERS = {REFERENCE_SEQUENCE_HEADER, TARGET_INTERVALS_HEADER, BAIT_INTERVALS_HEADER}; - private static final String ANALYSIS_FILES = "analysis_files.txt"; - - private String path; - private Map> headerValues = new HashMap>(); - - public PicardAnalysisFiles(String project, String sample) throws FileNotFoundException { - this(PicardAggregationUtils.getSampleDir(project, sample) + ANALYSIS_FILES); - } - - public PicardAnalysisFiles(String project, String sample, int version) throws FileNotFoundException { - this(PicardAggregationUtils.getSampleDir(project, sample, version) + ANALYSIS_FILES); - } - - public PicardAnalysisFiles(String path) throws FileNotFoundException { - this.path = path; - HashMap headerIndexes = null; - for (String line: new XReadLines(new File(path))) { - if (line.startsWith("#")) - continue; - String[] values = line.split("\t"); - if (headerIndexes == null) { - headerIndexes = new HashMap(); - for (String header: ANALYSIS_HEADERS) { - headerIndexes.put(header, ArrayUtils.indexOf(values, header)); - headerValues.put(header, new HashSet()); - } - } else { - for (String header: ANALYSIS_HEADERS) { - int index = headerIndexes.get(header); - if (values.length <= index) - throw new StingException(String.format("Unable to parse line in %s: %n%s", path, line)); - String value = values[index]; - headerValues.get(header).add(value); - } - } - } - } - - public String getPath() { - return path; - } - - public String getReferenceSequence() { - return getSingle(REFERENCE_SEQUENCE_HEADER); - } - - public String getTargetIntervals() { - return getSingle(TARGET_INTERVALS_HEADER); - } - - public String getBaitIntervals() { - return getSingle(BAIT_INTERVALS_HEADER); - } - - private String getSingle(String header) { - Set values = headerValues.get(header); - if (values.size() > 1) { - throw new UnsupportedOperationException(path + " contains more than one value for " + header + ": " + values); - } else if (values.size() == 0) { - return null; - } else { - String value = values.iterator().next(); - return "null".equals(value) ? null : value; - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java b/public/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java deleted file mode 100755 index 96bbb2455..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.broad; - -import org.apache.commons.io.FilenameUtils; -import org.apache.commons.lang.NullArgumentException; -import org.broadinstitute.sting.datasources.pipeline.Pipeline; -import org.broadinstitute.sting.datasources.pipeline.PipelineProject; -import org.broadinstitute.sting.datasources.pipeline.PipelineSample; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.yaml.YamlUtils; - -import java.io.File; -import java.io.FileNotFoundException; - -/** - * Automatically gets the latest version using PicardAggregationUtils. - */ -public class PicardPipeline { - - protected static final String PROJECT_TAG = "SQUIDProject"; - protected static final String SAMPLE_TAG = "CollaboratorID"; - protected static final String PICARD_BAM_TYPE = "cleaned"; - - private PicardPipeline() {} - - /** - * Creates a new PicardPipeline - * @param path Path to a tsv with project [tab] sample on each line or a pipeline yaml. - * @return a new Picard - * @throws FileNotFoundException when unable to find the file or any supporting files. - */ - public static Pipeline parse(File path) throws FileNotFoundException { - if (path == null) - throw new NullArgumentException("path"); - - Pipeline pipeline; - if (path.getName().endsWith(".tsv")) { - pipeline = new Pipeline(); - pipeline.getProject().setName(FilenameUtils.getBaseName(path.getPath())); - for (String line: new XReadLines(path)) { - String[] projectSample = line.split("\t"); - addSample(pipeline, projectSample[0], projectSample[1]); - } - } else if (path.getName().endsWith(".yaml")) { - pipeline = YamlUtils.load(Pipeline.class, path); - } else { - throw new UserException.BadInput("Path does not end with .tsv or .yaml: " + path.getPath()); - } - - update(pipeline); - return pipeline; - } - - private static void update(Pipeline pipeline) throws FileNotFoundException { - for (PipelineSample sample: pipeline.getSamples()) - updateSample(pipeline.getProject(), sample); - } - - private static void addSample(Pipeline pipeline, String project, String sample) { - PipelineSample pipelineSample = new PipelineSample(); - pipelineSample.getTags().put(PROJECT_TAG, project); - pipelineSample.getTags().put(SAMPLE_TAG, sample); - pipeline.getSamples().add(pipelineSample); - } - - private static void updateSample(PipelineProject pipelineProject, PipelineSample pipelineSample) throws FileNotFoundException { - if (!pipelineSample.getTags().containsKey(PROJECT_TAG) && !pipelineSample.getTags().containsKey(SAMPLE_TAG)) - return; - String project = pipelineSample.getTags().get(PROJECT_TAG); - String sample = pipelineSample.getTags().get(SAMPLE_TAG); - int version = PicardAggregationUtils.getLatestVersion(project, sample); - if (version <= 0) - throw new UserException.BadInput("Project sample not found: " + project + "/" + sample); - String bam = PicardAggregationUtils.getSampleBam(project, sample, version); - if (pipelineSample.getId() == null) - pipelineSample.setId(project + "_" + sample); - pipelineSample.getBamFiles().put(PICARD_BAM_TYPE, new File(bam)); - - PicardAnalysisFiles analysis = new PicardAnalysisFiles(project, sample, version); - if (pipelineProject.getReferenceFile() == null) { - String referenceSequence = analysis.getReferenceSequence(); - ReferenceData referenceData = ReferenceData.getByReference(referenceSequence); - pipelineProject.setReferenceFile(new File(referenceData.getReference())); - pipelineProject.setRefseqTable(new File(referenceData.getRefseq())); - if (analysis.getTargetIntervals() != null) - pipelineProject.setIntervalList(new File(analysis.getTargetIntervals())); - pipelineProject.setEvalDbsnp(new File(referenceData.getDbsnp(129))); - if (referenceData.getDbsnpVersions().contains(132)) { - pipelineProject.setGenotypeDbsnp(new File(referenceData.getDbsnp(132))); - } else { - pipelineProject.setGenotypeDbsnp(new File(referenceData.getDbsnp(129))); - } - } else { - String referenceSequence = analysis.getReferenceSequence(); - if (!pipelineProject.getReferenceFile().getPath().equals(referenceSequence)) - throw new UserException.BadInput("Samples sequenced with different references"); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java b/public/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java deleted file mode 100755 index 36a59ec70..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.broad; - -import java.util.Collections; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; - -/** - * Tracks data related to reference files at the Broad. - */ -public enum ReferenceData { - /** - * HG18 reference data - */ - HG18("hg18"), - - /** - * HG19 reference data - */ - HG19("hg19"); - - private static final String REFSEQ_DIR = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"; - private static final String DBSNP_DIR = "/humgen/gsa-hpprojects/GATK/data/"; - - private final String name; - private final String reference; - private final String refseq; - private final Map dbsnps; - - ReferenceData(String name) { - this.name = name; - Map dbsnps = new TreeMap(); - if ("hg18".equals(name)) { - this.reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; - this.refseq = REFSEQ_DIR + "refGene-big-table-hg18.txt"; - dbsnps.put(129, DBSNP_DIR + "dbsnp_129_hg18.rod"); - } else if ("hg19".equals(name)) { - this.reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; - this.refseq = REFSEQ_DIR + "refGene-big-table-hg19.txt"; - dbsnps.put(129, DBSNP_DIR + "dbsnp_129_b37.vcf"); - dbsnps.put(132, DBSNP_DIR + "dbsnp_132_b37.vcf"); - } else - throw new UnsupportedOperationException("Unknown reference: " + name); - this.dbsnps = Collections.unmodifiableMap(dbsnps); - } - - /** - * Returns the name of the reference. - * @return the name of the reference. - */ - public String getName() { - return name; - } - - /** - * Returns the path to the fasta. - * @return the path to the fasta. - */ - public String getReference() { - return reference; - } - - /** - * Returns the path to the refseq table. - * @return the path to the refseq table. - */ - public String getRefseq() { - return refseq; - } - - /** - * Returns the dbsnp versions available. - * @return the dbsnp versions available. - */ - public Set getDbsnpVersions() { - return dbsnps.keySet(); - } - - /** - * Returns the dbsnp path for the version. - * @param version version from getDbsnpVersions() - * @return the dbsnp path for the version. - */ - public String getDbsnp(int version) { - return dbsnps.get(version); - } - - /** - * Returns the dbsnp type for the version, "VCF" or "DBSNP". - * @param version version from getDbsnpVersions() - * @return the dbsnp type for the version, "VCF" or "DBSNP". - */ - public String getDbsnpType(int version) { - String dbsnp = getDbsnp(version); - if (dbsnp == null) - return null; - return dbsnp.toLowerCase().endsWith(".vcf") ? "VCF" : "DBSNP"; - } - - /** - * Returns the reference data based on the path or null. - * @param reference path to the reference - * @return the reference data based on the path or null. - */ - public static ReferenceData getByReference(String reference) { - for (ReferenceData data: ReferenceData.values()) - if (data.reference.equals(reference)) - return data; - return null; - } -} diff --git a/public/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java b/public/java/test/org/broadinstitute/sting/pipeline/PipelineUnitTest.java similarity index 96% rename from public/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java rename to public/java/test/org/broadinstitute/sting/pipeline/PipelineUnitTest.java index 8e18fac6f..891356670 100644 --- a/public/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/pipeline/PipelineUnitTest.java @@ -22,8 +22,10 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.datasources.pipeline; +package org.broadinstitute.sting.pipeline; +import org.broadinstitute.sting.pipeline.Pipeline; +import org.broadinstitute.sting.pipeline.PipelineSample; import org.testng.Assert; import org.broadinstitute.sting.utils.yaml.YamlUtils; diff --git a/public/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java deleted file mode 100755 index 77a8a1ac8..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.broadinstitute.sting.utils.broad; - -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.FileNotFoundException; - -public class PicardAggregationUtilsUnitTest { - public static final String PROJECT = "C474"; - public static final String SAMPLE = "NA19651"; - public static final String MISSING_PROJECT = "C0"; - public static final String MISSING_SAMPLE = "0"; - private int latestVersion = -1; - - @Test - public void testGetLatestVersion() { - latestVersion = PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE); - System.out.println(String.format("Latest version for %s %s is %d", PROJECT, SAMPLE, latestVersion)); - Assert.assertTrue(latestVersion > 0); - Assert.assertEquals(PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE, latestVersion), latestVersion); - } - - @Test(dependsOnMethods = "testGetLatestVersion") - public void testGetSampleBam() throws Exception { - String test = PicardAggregationUtils.getSampleBam(PROJECT, SAMPLE); - String latest = PicardAggregationUtils.getSampleBam(PROJECT, SAMPLE, latestVersion); - Assert.assertEquals(test, latest); - } - - @Test(dependsOnMethods = "testGetLatestVersion") - public void testGetSampleDir() throws Exception { - String test = PicardAggregationUtils.getSampleDir(PROJECT, SAMPLE); - String latest = PicardAggregationUtils.getSampleDir(PROJECT, SAMPLE, latestVersion); - Assert.assertEquals(test, latest); - } - - @Test(dependsOnMethods = "testGetLatestVersion") - public void testIsFinished() { - Assert.assertTrue(PicardAggregationUtils.isFinished(PROJECT, SAMPLE, latestVersion)); - Assert.assertFalse(PicardAggregationUtils.isFinished(PROJECT, SAMPLE, latestVersion + 1)); - } - - @Test(expectedExceptions = FileNotFoundException.class) - public void testMissingSampleBam() throws Exception { - PicardAggregationUtils.getSampleBam(MISSING_PROJECT, MISSING_SAMPLE); - } - - @Test(expectedExceptions = FileNotFoundException.class) - public void testMissingSampleDir() throws Exception { - PicardAggregationUtils.getSampleDir(MISSING_PROJECT, MISSING_SAMPLE); - } - - @Test - public void testLatestVersionMissing() { - Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE), 0); - Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, -1), -1); - Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 0), 0); - Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 1), 1); - Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 2), 2); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java deleted file mode 100755 index de58bf983..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java +++ /dev/null @@ -1,56 +0,0 @@ -package org.broadinstitute.sting.utils.broad; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.FileNotFoundException; - -import static org.broadinstitute.sting.utils.broad.PicardAggregationUtilsUnitTest.*; - -public class PicardAnalysisFilesUnitTest extends BaseTest { - @Test - public void testParseLatest() throws Exception { - PicardAnalysisFiles files = new PicardAnalysisFiles(PROJECT, SAMPLE); - Assert.assertNotNull(files.getPath()); - files = new PicardAnalysisFiles(PROJECT, SAMPLE, PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE)); - Assert.assertNotNull(files.getPath()); - } - - @Test - public void testParseValid() throws Exception { - PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file.txt"); - Assert.assertEquals(file.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"); - Assert.assertEquals(file.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"); - Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list"); - } - - @Test - public void testParseValidWithComments() throws Exception { - PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_comments.txt"); - Assert.assertEquals(file.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"); - Assert.assertEquals(file.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"); - Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list"); - } - - @Test(expectedExceptions = FileNotFoundException.class) - public void testParseBadPath() throws Exception { - new PicardAnalysisFiles(BaseTest.validationDataLocation + "non_existent_picard_analysis_file.txt"); - } - - @Test(expectedExceptions = FileNotFoundException.class) - public void testParseMissingLatest() throws Exception { - new PicardAnalysisFiles(MISSING_PROJECT, MISSING_SAMPLE); - } - - @Test(expectedExceptions = FileNotFoundException.class) - public void testParseMissingVersion() throws Exception { - new PicardAnalysisFiles(PROJECT, SAMPLE, PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE) + 2); - } - - @Test(expectedExceptions = UnsupportedOperationException.class) - public void testParseMultipleReferences() throws Exception { - PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_different_references.txt"); - file.getReferenceSequence(); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java deleted file mode 100755 index 121ca4619..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java +++ /dev/null @@ -1,71 +0,0 @@ -package org.broadinstitute.sting.utils.broad; - -import org.testng.Assert; -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.FilenameUtils; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.datasources.pipeline.Pipeline; -import org.broadinstitute.sting.datasources.pipeline.PipelineSample; -import org.broadinstitute.sting.utils.yaml.YamlUtils; -import org.testng.annotations.Test; -import static org.broadinstitute.sting.utils.broad.PicardAggregationUtilsUnitTest.*; - -import java.io.File; -import java.io.IOException; -import java.util.Collections; - -public class PicardPipelineUnitTest { - @Test - public void testParseTsv() throws IOException { - File tsv = writeTsv(PROJECT, SAMPLE); - Pipeline pipeline = PicardPipeline.parse(tsv); - validatePipeline(pipeline, FilenameUtils.getBaseName(tsv.getPath())); - } - - @Test - public void testParseTsvWithPicardComments() throws Exception { - File tsv = writeTsv("C460", "HG01359"); - PicardPipeline.parse(tsv); - } - - @Test - public void testParseYaml() throws IOException { - File yaml = writeYaml("project_name", PROJECT, SAMPLE); - Pipeline pipeline = PicardPipeline.parse(yaml); - validatePipeline(pipeline, "project_name"); - } - - private void validatePipeline(Pipeline pipeline, String name) { - Assert.assertEquals(pipeline.getProject().getName(), name); - Assert.assertTrue(pipeline.getProject().getReferenceFile().exists(), "reference not found"); - Assert.assertTrue(pipeline.getProject().getIntervalList().exists(), "intervals not found"); - Assert.assertTrue(pipeline.getProject().getRefseqTable().exists(), "refseq not found"); - Assert.assertTrue(pipeline.getProject().getGenotypeDbsnp().exists(), "genotype dbsnp not found"); - Assert.assertTrue(pipeline.getProject().getEvalDbsnp().exists(), "eval dbsnp not found"); - Assert.assertEquals(pipeline.getSamples().size(), 1); - for (PipelineSample sample: pipeline.getSamples()) { - Assert.assertEquals(sample.getId(), PROJECT + "_" + SAMPLE); - Assert.assertTrue(sample.getBamFiles().get(PicardPipeline.PICARD_BAM_TYPE).exists(), "bam not found"); - Assert.assertEquals(sample.getTags().get(PicardPipeline.PROJECT_TAG), PROJECT); - Assert.assertEquals(sample.getTags().get(PicardPipeline.SAMPLE_TAG), SAMPLE); - } - } - - private File writeTsv(String project, String sample) throws IOException { - File tsv = BaseTest.createTempFile("pipeline", ".tsv"); - FileUtils.writeLines(tsv, Collections.singletonList(project + "\t" + sample)); - return tsv; - } - - private File writeYaml(String projectName, String project, String sample) throws IOException { - File yaml = BaseTest.createTempFile("pipeline", ".yaml"); - PipelineSample pipelineSample = new PipelineSample(); - pipelineSample.getTags().put(PicardPipeline.PROJECT_TAG, project); - pipelineSample.getTags().put(PicardPipeline.SAMPLE_TAG, sample); - Pipeline pipeline = new Pipeline(); - pipeline.getProject().setName(projectName); - pipeline.getSamples().add(pipelineSample); - YamlUtils.dump(pipeline, yaml); - return yaml; - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java deleted file mode 100755 index e58ea3858..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java +++ /dev/null @@ -1,49 +0,0 @@ -package org.broadinstitute.sting.utils.broad; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; - -public class ReferenceDataUnitTest { - @Test - public void testNames() { - Assert.assertEquals(ReferenceData.HG18.getName(), "hg18"); - Assert.assertEquals(ReferenceData.HG19.getName(), "hg19"); - } - - @Test - public void testFilesExist() { - for (ReferenceData data: ReferenceData.values()) { - Assert.assertTrue(new File(data.getReference()).exists()); - Assert.assertTrue(new File(data.getRefseq()).exists()); - for (int version: data.getDbsnpVersions()) { - Assert.assertTrue(new File(data.getDbsnp(version)).exists()); - } - } - } - - @Test - public void testDbsnps() { - Assert.assertTrue(new File(ReferenceData.HG18.getDbsnp(129)).exists()); - Assert.assertTrue(new File(ReferenceData.HG19.getDbsnp(129)).exists()); - Assert.assertTrue(new File(ReferenceData.HG19.getDbsnp(132)).exists()); - Assert.assertNull(ReferenceData.HG19.getDbsnp(130)); - } - - @Test - public void testDbsnpTypes() { - Assert.assertEquals(ReferenceData.HG18.getDbsnpType(129), "DBSNP"); - Assert.assertEquals(ReferenceData.HG19.getDbsnpType(129), "VCF"); - Assert.assertEquals(ReferenceData.HG19.getDbsnpType(132), "VCF"); - Assert.assertNull(ReferenceData.HG19.getDbsnpType(130)); - } - - @Test - public void testGetByReference() { - Assert.assertEquals(ReferenceData.getByReference(BaseTest.hg18Reference), ReferenceData.HG18); - Assert.assertEquals(ReferenceData.getByReference(BaseTest.hg19Reference), ReferenceData.HG19); - Assert.assertEquals(ReferenceData.getByReference("none"), null); - } -} diff --git a/public/packages/Queue.xml b/public/packages/Queue.xml index 347b24db3..58da4398e 100644 --- a/public/packages/Queue.xml +++ b/public/packages/Queue.xml @@ -33,7 +33,7 @@ - + diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index 6b87981ff..dc3cfd9d4 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -32,8 +32,6 @@ import java.util.Date import java.text.SimpleDateFormat import org.broadinstitute.sting.BaseTest import org.broadinstitute.sting.queue.QCommandLine -import org.broadinstitute.sting.datasources.pipeline.{Pipeline, PipelineProject, PipelineSample} -import org.broadinstitute.sting.utils.broad.PicardAggregationUtils import org.broadinstitute.sting.queue.util.{Logging, ProcessController} import java.io.{FileNotFoundException, File} import org.broadinstitute.sting.gatk.report.GATKReportParser @@ -42,23 +40,6 @@ import org.broadinstitute.sting.queue.engine.CommandLinePluginManager object PipelineTest extends BaseTest with Logging { - case class K1gBam(squidId: String, sampleId: String, version: Int) - - /** 1000G BAMs used for validation */ - val k1gBams = List( - new K1gBam("C474", "NA19651", 2), - new K1gBam("C474", "NA19655", 2), - new K1gBam("C474", "NA19669", 2), - new K1gBam("C454", "NA19834", 2), - new K1gBam("C460", "HG01440", 2), - new K1gBam("C456", "NA12342", 2), - new K1gBam("C456", "NA12748", 2), - new K1gBam("C474", "NA19649", 2), - new K1gBam("C474", "NA19652", 2), - new K1gBam("C474", "NA19654", 2)) - - validateK1gBams() - private val validationReportsDataLocation = "/humgen/gsa-hpprojects/GATK/validationreports/submitted/" val run = System.getProperty("pipeline.run") == "run" @@ -92,49 +73,6 @@ object PipelineTest extends BaseTest with Logging { */ private def tempDir(testName: String, jobRunner: String) = testDir(testName, jobRunner) + "temp/" - /** - * Creates a new pipeline from a project. - * @param project Pipeline project info. - * @param samples List of samples. - * @return a new pipeline project. - */ - def createPipeline(project: PipelineProject, samples: List[PipelineSample]) = { - val pipeline = new Pipeline - pipeline.setProject(project) - pipeline.setSamples(samples) - pipeline - } - - /** - * Creates a new pipeline project for hg19 with b37 132 dbsnp for genotyping, and b37 129 dbsnp for eval. - * @param projectName Name of the project. - * @param intervals The intervals file to use. - * @return a new pipeline project. - */ - def createHg19Project(projectName: String, intervals: String) = { - val project = new PipelineProject - project.setName(projectName) - project.setReferenceFile(new File(BaseTest.hg19Reference)) - project.setGenotypeDbsnp(new File(BaseTest.b37dbSNP132)) - project.setEvalDbsnp(new File(BaseTest.b37dbSNP129)) - project.setRefseqTable(new File(BaseTest.hg19Refseq)) - project.setIntervalList(new File(intervals)) - project - } - - /** - * Creates a 1000G pipeline sample from one of the bams. - * @param idPrefix Text to prepend to the sample name. - * @param k1gBam bam to create the sample for. - * @return the created pipeline sample. - */ - def createK1gSample(idPrefix: String, k1gBam: K1gBam) = { - val sample = new PipelineSample - sample.setId(idPrefix + "_" + k1gBam.sampleId) - sample.setBamFiles(Map("cleaned" -> getPicardBam(k1gBam))) - sample - } - /** * Runs the pipelineTest. * @param pipelineTest test to run. @@ -267,31 +205,6 @@ object PipelineTest extends BaseTest with Logging { } } - /** - * Throws an exception if any of the 1000G bams do not exist and warns if they are out of date. - */ - private def validateK1gBams() { - var missingBams = List.empty[File] - for (k1gBam <- k1gBams) { - val latest = getLatestVersion(k1gBam) - val bam = getPicardBam(k1gBam) - if (k1gBam.version != latest) - logger.warn("1000G bam is not the latest version %d: %s".format(latest, k1gBam)) - if (!bam.exists) - missingBams :+= bam - } - if (missingBams.size > 0) { - val nl = "%n".format() - throw new FileNotFoundException("The following 1000G bam files are missing.%n%s".format(missingBams.mkString(nl))) - } - } - - private def getPicardBam(k1gBam: K1gBam): File = - new File(PicardAggregationUtils.getSampleBam(k1gBam.squidId, k1gBam.sampleId, k1gBam.version)) - - private def getLatestVersion(k1gBam: K1gBam): Int = - PicardAggregationUtils.getLatestVersion(k1gBam.squidId, k1gBam.sampleId, k1gBam.version) - private var runningCommandLines = Set.empty[QCommandLine] Runtime.getRuntime.addShutdownHook(new Thread { From 17ff5bb09436b8805657859b787f4f7f8714cc08 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sat, 2 Jul 2011 09:55:35 -0400 Subject: [PATCH 28/31] Variant records coming out of the VQSR are now annotated with which input annotation was most divergent from the Gaussian mixture model. This gives a general sense for why each variant was removed from the callset. --- .../ApplyRecalibration.java | 8 +++++++- .../GaussianMixtureModel.java | 13 +++++++++++++ .../VariantDataManager.java | 4 +++- .../variantrecalibration/VariantDatum.java | 1 + .../VariantRecalibrator.java | 2 ++ .../VariantRecalibratorEngine.java | 17 +++++++++++++++++ ...iantRecalibrationWalkersIntegrationTest.java | 4 ++-- 7 files changed, 45 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 9877781d1..02d850211 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -55,7 +55,6 @@ import java.util.*; public class ApplyRecalibration extends RodWalker { - ///////////////////////////// // Inputs ///////////////////////////// @@ -86,6 +85,7 @@ public class ApplyRecalibration extends RodWalker { final private List tranches = new ArrayList(); final private Set inputNames = new HashSet(); final private NestedHashMap lodMap = new NestedHashMap(); + final private NestedHashMap annotationMap = new NestedHashMap(); final private Set ignoreInputFilterSet = new TreeSet(); //--------------------------------------------------------------------------------------------------------------- @@ -124,6 +124,7 @@ public class ApplyRecalibration extends RodWalker { final Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); + hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); final TreeSet samples = new TreeSet(); samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)); @@ -149,6 +150,7 @@ public class ApplyRecalibration extends RodWalker { for ( final String line : new XReadLines( RECAL_FILE ) ) { final String[] vals = line.split(","); lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys + annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys } } catch ( FileNotFoundException e ) { throw new UserException.CouldNotReadInputFile(RECAL_FILE, e); @@ -174,11 +176,15 @@ public class ApplyRecalibration extends RodWalker { String filterString = null; final Map attrs = new HashMap(vc.getAttributes()); final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); + final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); if( lod == null ) { throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); } + // Annotate the new record with its VQSLOD and the worst performing annotation attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); + attrs.put(VariantRecalibrator.CULPRIT_KEY, worstAnnotation); + for( int i = tranches.size() - 1; i >= 0; i-- ) { final Tranche tranche = tranches.get(i); if( lod >= tranche.minVQSLod ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index 9ffe7be7a..a09a30145 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -190,6 +190,19 @@ public class GaussianMixtureModel { return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) } + public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) { + if(datum.isNull[iii]) { return null; } + + final Normal normal = new Normal(0.0, 1.0, null); + final double[] pVarInGaussianLog10 = new double[gaussians.size()]; + int gaussianIndex = 0; + for( final MultivariateGaussian gaussian : gaussians ) { + normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) ); + pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) ); + } + return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) + } + public double evaluateDatumMarginalized( final VariantDatum datum ) { int numVals = 0; double sumPVarInGaussian = 0.0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 2fd1326fe..efff5b780 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -254,7 +254,9 @@ public class VariantDataManager { public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { for( final VariantDatum datum : data ) { - RECAL_FILE.println(String.format("%s,%d,%d,%.4f", datum.contig, datum.start, datum.stop, datum.lod)); + RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", + datum.contig, datum.start, datum.stop, datum.lod, + (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"))); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index ac875b645..38e2affb6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -24,6 +24,7 @@ public class VariantDatum implements Comparable { public String contig; public int start; public int stop; + public int worstAnnotation; public MultivariateGaussian assignment; // used in K-means implementation public int compareTo( final VariantDatum other ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index e651b62e0..b5fda4443 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -60,6 +60,7 @@ import java.util.*; public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { public static final String VQS_LOD_KEY = "VQSLOD"; + public static final String CULPRIT_KEY = "culprit"; @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); @@ -232,6 +233,7 @@ public class VariantRecalibrator extends RodWalker randomData = dataManager.getRandomDataForPlotting( 6000 ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java index a0fbc572d..81e4d190c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -57,6 +57,23 @@ public class VariantRecalibratorEngine { } } + public void calculateWorstPerformingAnnotation( final List data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) { + for( final VariantDatum datum : data ) { + int worstAnnotation = -1; + double minProb = Double.MAX_VALUE; + for( int iii = 0; iii < datum.annotations.length; iii++ ) { + final Double goodProbLog10 = goodModel.evaluateDatumInOneDimension(datum, iii); + final Double badProbLog10 = badModel.evaluateDatumInOneDimension(datum, iii); + if( goodProbLog10 != null && badProbLog10 != null ) { + final double prob = goodProbLog10 - badProbLog10; + if(prob < minProb) { minProb = prob; worstAnnotation = iii; } + } + } + datum.worstAnnotation = worstAnnotation; + } + } + + ///////////////////////////// // Private Methods used for generating a GaussianMixtureModel ///////////////////////////// diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index eb6a1a4c6..9600046da 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -26,8 +26,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf", "d33212a84368e821cbedecd4f59756d6", // tranches - "a35cd067f378442eee8cd5edeea92be0", // recal file - "126d52843f4a57199ee97750ffc16a07"); // cut VCF + "4652dca41222bebdf9d9fda343b2a835", // recal file + "5350b1a4c1250cf3b77ca45327c04711"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { From 5faf40b79deea87ceb0e682bf6e27721cc51f621 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sat, 2 Jul 2011 10:39:53 -0400 Subject: [PATCH 30/31] Moving AnalyzeAnnotations into the archive because it has outlived its usefulness. --- .../R/plot_Annotations_BinnedTruthMetrics.R | 190 ------------------ .../AnalyzeAnnotationsWalker.java | 154 -------------- .../AnnotationDataManager.java | 169 ---------------- .../analyzeannotations/AnnotationDatum.java | 170 ---------------- public/packages/GenomeAnalysisTK.xml | 1 - 5 files changed, 684 deletions(-) delete mode 100644 public/R/plot_Annotations_BinnedTruthMetrics.R delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java diff --git a/public/R/plot_Annotations_BinnedTruthMetrics.R b/public/R/plot_Annotations_BinnedTruthMetrics.R deleted file mode 100644 index 9f9ee290c..000000000 --- a/public/R/plot_Annotations_BinnedTruthMetrics.R +++ /dev/null @@ -1,190 +0,0 @@ -#!/bin/env Rscript - -args <- commandArgs(TRUE) -verbose = TRUE - -input = args[1] -annotationName = args[2] -minBinCutoff = as.numeric(args[3]) -medianNumVariants = args[4] - -c <- read.table(input, header=T) - -all = c[c$numVariants>minBinCutoff & c$category=="all",] -novel = c[c$numVariants>minBinCutoff & c$category=="novel",] -dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] -truth = c[c$numVariants>minBinCutoff & c$category=="truth",] - -# -# Calculate min, max, medians -# - -d = c[c$numVariants>minBinCutoff,] -ymin = min(d$titv) -ymax = max(d$titv) -xmin = min(d$value) -xmax = max(d$value) -m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) -ma = all[all$value > m,] -mb = all[all$value < m,] -m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) -m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) -if(medianNumVariants == "true") { -vc = cumsum( all$numVariants/sum(all$numVariants) ) -m10 = all$value[ max(which(vc<=0.10)) ] -m25 = all$value[ max(which(vc<=0.25)) ] -m = all$value[ max(which(vc<=0.5)) ] -m75 = all$value[ min(which(vc>=0.75)) ] -m90 = all$value[ min(which(vc>=0.90)) ] -} - -# -# Plot TiTv ratio as a function of the annotation -# - -outfile = paste(input, ".TiTv.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -abline(v=m,lty=2,col="red") -abline(v=m75,lty=3) -abline(v=m25,lty=3) -text(m, ymin, "50", col="red", cex=0.6); -text(m75, ymin, "75", col="black", cex=0.6); -text(m25, ymin, "25", col="black", cex=0.6); -if(medianNumVariants == "true") { -abline(v=m90,lty=3) -abline(v=m10,lty=3) -text(m10, ymin, "10", col="black", cex=0.6); -text(m90, ymin, "90", col="black", cex=0.6); -} -points(novel$value,novel$titv,col="green",pch=20) -points(dbsnp$value,dbsnp$titv,col="blue",pch=20) -if( sum(all$truePositive==0) != length(all$truePositive) ) { -points(truth$value,truth$titv,col="magenta",pch=20) -legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) -} else { -legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) -} -dev.off() - -# -# Plot TiTv ratio as a function of the annotation, log scale on the x-axis -# - -outfile = paste(input, ".TiTv_log.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -abline(v=m,lty=2,col="red") -abline(v=m75,lty=3) -abline(v=m25,lty=3) -text(m, ymin, "50", col="red", cex=0.6); -text(m75, ymin, "75", col="black", cex=0.6); -text(m25, ymin, "25", col="black", cex=0.6); -if(medianNumVariants == "true") { -abline(v=m90,lty=3) -abline(v=m10,lty=3) -text(m10, ymin, "10", col="black", cex=0.6); -text(m90, ymin, "90", col="black", cex=0.6); -} -points(novel$value,novel$titv,col="green",pch=20) -points(dbsnp$value,dbsnp$titv,col="blue",pch=20) -if( sum(all$truePositive==0) != length(all$truePositive) ) { -points(truth$value,truth$titv,col="magenta",pch=20) -legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) -} else { -legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) -} -dev.off() - -# -# Plot dbsnp and true positive rate as a function of the annotation -# - -ymin = min(all$dbsnp) -ymax = max(all$dbsnp) -outfile = paste(input, ".truthRate.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -yLabel = "DBsnp Rate" -if( sum(all$truePositive==0) != length(all$truePositive) ) { -t = all[all$truePositive>0,] -yLabel = "DBsnp/True Positive Rate" -ymin = min(min(all$dbsnp),min(t$truePositive)) -ymax = max(max(all$dbsnp),max(t$truePositive)) -} -plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -abline(v=m,lty=2,col="red") -abline(v=m75,lty=3) -abline(v=m25,lty=3) -text(m, ymin, "50", col="red", cex=0.6); -text(m75, ymin, "75", col="black", cex=0.6); -text(m25, ymin, "25", col="black", cex=0.6); -if(medianNumVariants == "true") { -abline(v=m90,lty=3) -abline(v=m10,lty=3) -text(m10, ymin, "10", col="black", cex=0.6); -text(m90, ymin, "90", col="black", cex=0.6); -} -if( sum(all$truePositive==0) != length(all$truePositive) ) { -points(t$value,t$truePositive,col="magenta",pch=20); -legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) -} -dev.off() - -# -# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis -# - -outfile = paste(input, ".truthRate_log.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -yLabel = "DBsnp Rate" -if( sum(all$truePositive==0) != length(all$truePositive) ) { -yLabel = "DBsnp/Truth Rate" -} -plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -abline(v=m,lty=2,col="red") -abline(v=m75,lty=3) -abline(v=m25,lty=3) -text(m, ymin, "50", col="red", cex=0.6); -text(m75, ymin, "75", col="black", cex=0.6); -text(m25, ymin, "25", col="black", cex=0.6); -if(medianNumVariants == "true") { -abline(v=m90,lty=3) -abline(v=m10,lty=3) -text(m10, ymin, "10", col="black", cex=0.6); -text(m90, ymin, "90", col="black", cex=0.6); -} -if( sum(all$truePositive==0) != length(all$truePositive) ) { -points(t$value,t$truePositive,col="magenta",pch=20); -legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) -} -dev.off() - -# -# Plot histogram of the annotation's value -# - -outfile = paste(input, ".Histogram.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -dev.off() - -# -# Plot histogram of the annotation's value, log scale on x-axis -# - -outfile = paste(input, ".Histogram_log.pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -dev.off() diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java deleted file mode 100755 index 54fd0a20c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.analyzeannotations; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.commandline.Argument; - -import java.util.HashMap; -import java.util.Collection; - -/** - * Takes variant calls as .vcf files and creates plots of truth metrics as a function of the various annotations found in the INFO field. - * - * @author rpoplin - * @since Jan 15, 2010 - * @help.summary Takes variant calls as .vcf files and creates plots of truth metrics as a function of the various annotations found in the INFO field. - */ - -public class AnalyzeAnnotationsWalker extends RodWalker { - - ///////////////////////////// - // Command Line Arguments - ///////////////////////////// - @Argument(fullName = "output_prefix", shortName = "output", doc = "The output path and name to prepend to all plots and intermediate data files", required = false) - private String OUTPUT_PREFIX = "analyzeAnnotations/"; - @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) - private String PATH_TO_RSCRIPT = "Rscript"; - @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) - private String PATH_TO_RESOURCES = "R/"; - @Argument(fullName = "min_variants_per_bin", shortName = "minBinSize", doc = "The minimum number of variants in a bin in order to calculate truth metrics.", required = false) - private int MIN_VARIANTS_PER_BIN = 1000; - @Argument(fullName = "max_variants_per_bin", shortName = "maxBinSize", doc = "The maximum number of variants in a bin.", required = false) - private int MAX_VARIANTS_PER_BIN = 20000; - @Argument(fullName = "sampleName", shortName = "sampleName", doc = "If supplied, only process variants found in this sample.", required = false) - private String SAMPLE_NAME = null; - @Argument(fullName = "name", shortName = "name", doc = "Labels for the annotations to make plots look nicer. Each name is a separate -name argument. For example, -name DP,Depth -name AB,AlleleBalance", required = false) - private String[] ANNOTATION_NAMES = null; - @Argument(fullName = "indicate_mean_num_vars", shortName = "meanNumVars", doc = "If supplied, plots will indicate the distribution of number of variants instead of distribution of value of annotation", required = false) - private boolean INDICATE_MEAN_NUM_VARS = false; - - ///////////////////////////// - // Private Member Variables - ///////////////////////////// - private AnnotationDataManager dataManager; - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - public void initialize() { - - // Create a HashMap associating the names of the annotations to full Strings that can be used as labels on plots - HashMap nameMap = null; - if( ANNOTATION_NAMES != null ) { - nameMap = new HashMap(); - for( String nameLine : ANNOTATION_NAMES ) { - String[] vals = nameLine.split(","); - nameMap.put(vals[0],vals[1]); - } - } - dataManager = new AnnotationDataManager( nameMap, INDICATE_MEAN_NUM_VARS ); - - if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; } - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - - if( tracker != null ) { - Collection VCs = tracker.getAllVariantContexts(ref); - - // First find out if this variant is in the truth sets - boolean isInTruthSet = false; - boolean isTrueVariant = false; - for ( VariantContext vc : VCs ) { - if( vc != null && vc.isSNP() && !vc.isFiltered() ) { - if( vc.getSource().toUpperCase().startsWith("TRUTH") ) { - isInTruthSet = true; - if( vc.isBiallelic() && vc.isVariant() ) { - isTrueVariant = true; - } - } - } - } - - // Add each annotation in this VCF Record to the dataManager - for ( VariantContext vc : VCs ) { - if( vc != null && vc.isSNP() && vc.isBiallelic() && !vc.isFiltered() ) { - if( !vc.getSource().toUpperCase().startsWith("TRUTH") ) { - if( vc.isVariant() ) { - dataManager.addAnnotations( vc, SAMPLE_NAME, isInTruthSet, isTrueVariant ); - } - } - } - } - } - - return 1; // This value isn't actually used for anything - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - public Integer reduceInit() { - return 0; // Nothing to do here - } - - public Integer reduce( Integer value, Integer sum ) { - return 0; // Nothing to do here - } - - public void onTraversalDone( Integer sum ) { - - // For each annotation, decide how to cut up the data, output intermediate cumulative p(true) tables, and call RScript to plot the tables - dataManager.plotCumulativeTables(PATH_TO_RSCRIPT, PATH_TO_RESOURCES, OUTPUT_PREFIX, MIN_VARIANTS_PER_BIN, MAX_VARIANTS_PER_BIN); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java deleted file mode 100755 index 2c06b30d6..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java +++ /dev/null @@ -1,169 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.analyzeannotations; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.util.*; -import java.io.IOException; -import java.io.PrintStream; -import java.io.FileNotFoundException; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Jan 18, 2010 - */ - -public class AnnotationDataManager { - - private final HashMap> data; - private final HashMap nameMap; - private final boolean INDICATE_MEAN_NUM_VARS; - - public AnnotationDataManager( HashMap _nameMap, boolean _INDICATE_MEAN_NUM_VARS ) { - data = new HashMap>(); - nameMap = _nameMap; - INDICATE_MEAN_NUM_VARS = _INDICATE_MEAN_NUM_VARS; - } - - public void addAnnotations( final VariantContext vc, final String sampleName, final boolean isInTruthSet, final boolean isTrueVariant ) { - - if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName - if( vc.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out - return; - } - } // else, process all samples - - // Loop over each annotation in the vcf record - final Map infoField = new HashMap(vc.getAttributes()); - infoField.put("QUAL", ((Double)vc.getPhredScaledQual()).toString() ); // add QUAL field to annotations - for( Map.Entry annotation : infoField.entrySet() ) { - - float value; - try { - value = Float.parseFloat( annotation.getValue().toString() ); - } catch( NumberFormatException e ) { - continue; // Skip over annotations that aren't floats, like "DB" - } - - TreeSet treeSet = data.get( annotation.getKey() ); - if( treeSet == null ) { // This annotation hasn't been seen before - treeSet = new TreeSet( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation - data.put( annotation.getKey(), treeSet ); - } - AnnotationDatum datum = new AnnotationDatum( value ); - if( treeSet.contains(datum) ) { // contains() uses AnnotationDatum's equals function, so it only checks if the value field is already present - datum = treeSet.tailSet(datum).first(); - } else { - treeSet.add(datum); - } - - final boolean isNovelVariant = !vc.hasID() || !vc.getID().contains("rs"); - - // Decide if the variant is a transition or transversion - if( VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0 ) { - datum.incrementTi( isNovelVariant, isInTruthSet, isTrueVariant ); - } else { - datum.incrementTv( isNovelVariant, isInTruthSet, isTrueVariant ); - } - } - } - - public void plotCumulativeTables( final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_PREFIX, - final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) { - - final AnnotationDatum thisAnnotationBin = new AnnotationDatum(); - System.out.println( "\nFinished reading variants into memory. Executing RScript commands:" ); - - // For each annotation we've seen - for( final String annotationKey : data.keySet() ) { - - String filename = OUTPUT_PREFIX + annotationKey + ".dat"; - PrintStream output; - try { - output = new PrintStream(filename); // Create the intermediate data file for this annotation - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotCreateOutputFile(new File(filename), "Can't create intermediate output annotation data file. Does the output directory exist?", e); - } - - // Output a header line - output.println("value\ttitv\tdbsnp\ttruePositive\tnumVariants\tcategory"); - - // Bin SNPs and calculate truth metrics for each bin - thisAnnotationBin.clearBin(); - for( final AnnotationDatum datum : data.get( annotationKey ) ) { - thisAnnotationBin.combine( datum ); - if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() + - "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth"); - thisAnnotationBin.clearBin(); - } - // else, continue accumulating variants because this bin isn't full yet - } - - // One final bin that may not have been dumped out - if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() + - "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth"); - thisAnnotationBin.clearBin(); - } - - // Close the PrintStream - output.close(); - - String annotationName = null; - if( nameMap != null ) { - annotationName = nameMap.get(annotationKey); - } - if( annotationName == null ) { // This annotation is not in the name map so use the key instead - annotationName = annotationKey; - } - - // Print out the command line to make it clear to the user what is being executed and how one might modify it - final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTruthMetrics.R" + " " + - OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationName + " " + MIN_VARIANTS_PER_BIN + " " + INDICATE_MEAN_NUM_VARS; - System.out.println( rScriptCommandLine ); - - // Execute the RScript command to plot the table of truth values - try { - Runtime.getRuntime().exec( rScriptCommandLine ); - } catch ( IOException e ) { - throw new UserException.CannotExecuteRScript( rScriptCommandLine, e ); - } - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java deleted file mode 100755 index 888eb7e6e..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java +++ /dev/null @@ -1,170 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.analyzeannotations; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Comparator; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Jan 18, 2010 - */ - -public class AnnotationDatum implements Comparator { - - public float value; - private final int[] ti; - private final int[] tv; - - public static final int FULL_SET = 0; - public static final int NOVEL_SET = 1; - public static final int DBSNP_SET = 2; - public static final int TRUTH_SET = 3; - public static final int TRUE_POSITIVE = 4; - private static final int NUM_SETS = 5; - - public AnnotationDatum() { - - value = 0.0f; - ti = new int[NUM_SETS]; - tv = new int[NUM_SETS]; - for( int iii = 0; iii < NUM_SETS; iii++ ) { - ti[iii] = 0; - tv[iii] = 0; - } - } - - public AnnotationDatum( final float _value ) { - - value = _value; - ti = new int[NUM_SETS]; - tv = new int[NUM_SETS]; - for( int iii = 0; iii < NUM_SETS; iii++ ) { - ti[iii] = 0; - tv[iii] = 0; - } - } - - final public void incrementTi( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) { - - ti[FULL_SET]++; - if( isNovelVariant ) { - ti[NOVEL_SET]++; - } else { // Is known, in DBsnp - ti[DBSNP_SET]++; - } - if( isInTruthSet ) { - ti[TRUTH_SET]++; - if( isTrueVariant ) { - ti[TRUE_POSITIVE]++; - } - } - } - - final public void incrementTv( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) { - - tv[FULL_SET]++; - if( isNovelVariant ) { - tv[NOVEL_SET]++; - } else { // Is known, in DBsnp - tv[DBSNP_SET]++; - } - if( isInTruthSet ) { - tv[TRUTH_SET]++; - if( isTrueVariant ) { - tv[TRUE_POSITIVE]++; - } - } - } - - final public void combine( final AnnotationDatum that ) { - - for( int iii = 0; iii < NUM_SETS; iii++ ) { - this.ti[iii] += that.ti[iii]; - this.tv[iii] += that.tv[iii]; - } - this.value = that.value; // Overwrite this bin's value - } - - final public float calcTiTv( final int INDEX ) { - - if( ti[INDEX] < 0 || tv[INDEX] < 0 ) { - throw new ReviewedStingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); - } - - if( tv[INDEX] == 0 ) { // Don't divide by zero - return 0.0f; - } - - return ((float) ti[INDEX]) / ((float) tv[INDEX]); - } - - final public float calcDBsnpRate() { - - if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero - return 0.0f; - } - - return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) / - ((float) ti[FULL_SET] + tv[FULL_SET]); - } - - final public float calcTPrate() { - - if( ti[TRUTH_SET] + tv[TRUTH_SET] == 0 ) { // Don't divide by zero - return 0.0f; - } - - return 100.0f * ((float) ti[TRUE_POSITIVE] + tv[TRUE_POSITIVE]) / - ((float) ti[TRUTH_SET] + tv[TRUTH_SET]); - } - - final public int numVariants( final int INDEX ) { - return ti[INDEX] + tv[INDEX]; - } - - final public void clearBin() { - value = 0.0f; - for( int iii = 0; iii < NUM_SETS; iii++ ) { - ti[iii] = 0; - tv[iii] = 0; - } - } - - public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { // Function needed for this to be a Comparator - if( a1.value < a2.value ) { return -1; } - if( a1.value > a2.value ) { return 1; } - return 0; - } - - public int equals( AnnotationDatum that ) { // Function needed for this to be sorted correctly in a TreeSet - if( this.value < that.value ) { return -1; } - if( this.value > that.value ) { return 1; } - return 0; - } -} diff --git a/public/packages/GenomeAnalysisTK.xml b/public/packages/GenomeAnalysisTK.xml index f923ea698..14b837211 100644 --- a/public/packages/GenomeAnalysisTK.xml +++ b/public/packages/GenomeAnalysisTK.xml @@ -19,7 +19,6 @@ -