diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java index 16dac6fbe..8153d9b11 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java @@ -64,4 +64,26 @@ public class VariantCallContext extends VariantContext { public void setRefBase(byte ref) { this.refBase = ref; } + + /* these methods are only implemented for GENOTYPE_GIVEN_ALLELES MODE */ + //todo -- expand these methods to all modes + + /** + * + * @param callConfidenceThreshold the Unified Argument Collection STANDARD_CONFIDENCE_FOR_CALLING + * @return true if call was confidently ref + */ + public boolean isCalledRef(double callConfidenceThreshold) { + return (confidentlyCalled && (getPhredScaledQual() < callConfidenceThreshold)); + } + + /** + * + * @param callConfidenceThreshold the Unified Argument Collection STANDARD_CONFIDENCE_FOR_CALLING + * @return true if call was confidently alt + */ + public boolean isCalledAlt(double callConfidenceThreshold) { + return (confidentlyCalled && (getPhredScaledQual() > callConfidenceThreshold)); + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/GenotypeAndValidateWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/GenotypeAndValidateWalker.java index e2a0e4847..145228f71 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/GenotypeAndValidateWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/GenotypeAndValidateWalker.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.playground.gatk.walkers; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.MutableVariantContext; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.VCFHeader; @@ -66,11 +68,14 @@ import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; @Reference(window=@Window(start=-200,stop=200)) -public class GenotypeAndValidateWalker extends RodWalker { +public class GenotypeAndValidateWalker extends RodWalker implements TreeReducible { - @Output(doc="File to which validated variants should be written", required=true) + @Output(doc="File to which validated variants should be written", required=false) protected VCFWriter vcfWriter = null; + @Argument(fullName ="set_bam_truth", shortName ="bt", doc="Use the calls on the reads (bam file) as the truth dataset and validate the calls on the VCF", required=false) + private boolean bamIsTruth = false; + @Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false) private int mbq = -1; @@ -86,32 +91,35 @@ public class GenotypeAndValidateWalker extends RodWalker header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), compName); - Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); - headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); - vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); - + if (vcfWriter != null) { + Map header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), compName); + Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); + headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); + vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + } // Filling in SNP calling arguments for UG UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; - uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + if (!bamIsTruth) uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq; if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions; - if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf; - + if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac); // Adding the INDEL calling arguments for UG uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.DINDEL; indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac); + + // make sure we have callConf set to the threshold set by the UAC so we can use it later. + callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING; } //--------------------------------------------------------------------------------------------------------------- @@ -181,47 +192,74 @@ public class GenotypeAndValidateWalker extends RodWalker 0 && context.getBasePileup().getBases().length < minDepth)) { - counter.numUncovered = 1L; + counter.nUncovered = 1L; return counter; } - if (!vcComp.hasAttribute("GV")) - throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart()); - VariantCallContext call; if ( vcComp.isSNP() ) call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); else if ( vcComp.isIndel() ) { call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); -// if (call.vc == null) // variant context will be null on an extended indel event and I just want to call it one event. -// return counter; } else { logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles()); return counter; } - if (!call.confidentlyCalled) { - counter.numNotConfidentCalls = 1L; - if (vcComp.getAttribute("GV").equals("T")) - counter.numFN = 1L; - else - counter.numTN = 1L; + if (bamIsTruth) { + if (call.confidentlyCalled) { + // If truth is a confident REF call + if (call.isVariant()) { + if (vcComp.isVariant()) + counter.nAltCalledAlt = 1L; // todo -- may wanna check if the alts called are the same? + else + counter.nAltCalledRef = 1L; + } + // If truth is a confident ALT call + else { + if (vcComp.isVariant()) + counter.nRefCalledAlt = 1L; + else + counter.nRefCalledRef = 1L; + } + } + else { + counter.nNotConfidentCalls = 1L; + } } else { - counter.numConfidentCalls = 1L; - if (vcComp.getAttribute("GV").equals("T")) - counter.numTP = 1L; + if (!vcComp.hasAttribute("GV")) + throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart()); + + + + if (call.isCalledAlt(callConf)) { + if (vcComp.getAttribute("GV").equals("T")) + counter.nAltCalledAlt = 1L; + else + counter.nRefCalledAlt = 1L; + } + else if (call.isCalledRef(callConf)) { + if (vcComp.getAttribute("GV").equals("T")) + counter.nAltCalledRef = 1L; + else + counter.nRefCalledRef = 1L; + } + else { + counter.nNotConfidentCalls = 1L; + } + } + + if (vcfWriter != null) { + if (!vcComp.hasAttribute("callStatus")) { + MutableVariantContext mvc = new MutableVariantContext(vcComp); + mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF" ); + vcfWriter.add(mvc, ref.getBase()); + } else - counter.numFP = 1L; + vcfWriter.add(vcComp, ref.getBase()); } - if (!vcComp.hasAttribute("callStatus")) { - MutableVariantContext mvc = new MutableVariantContext(vcComp); - mvc.putAttribute("callStatus", call.confidentlyCalled ? "confident" : "notConfident" ); - vcfWriter.add(mvc, ref.getBase()); - } - else - vcfWriter.add(vcComp, ref.getBase()); return counter; } @@ -235,17 +273,22 @@ public class GenotypeAndValidateWalker extends RodWalker " + file) - val globalIntervals = new File(outName + ".intervals") + // If this is a 'knowns only' indel realignment run, do it only once for all samples. + val globalIntervals = new File(outputDir + projectName + ".intervals") if (knownsOnly) add(target(null, globalIntervals)) + // Put each sample through the pipeline for ((sample, bam) <- sampleBamFiles) { // BAM files generated by the pipeline @@ -124,7 +160,13 @@ class dataProcessingV2 extends QScript { cov(recalBam, postRecalFile), analyzeCovariates(preRecalFile, preOutPath), analyzeCovariates(postRecalFile, postOutPath)) + + cohortList :+= recalBam } + + // output a BAM list with all the processed per sample files + val cohortFile = new File(qscript.outputDir + qscript.projectName + ".cohort.list") + add(writeList(cohortList, cohortFile)) } // General arguments to all programs @@ -142,9 +184,9 @@ class dataProcessingV2 extends QScript { override def inputBams = join override def outputBam = joined override def commandLine = super.commandLine + " CREATE_INDEX=true" - this.memoryLimit = Some(6) this.jarFile = qscript.mergeBamJar this.isIntermediate = true + this.analysisName = queueLogDir + outBam + ".joinBams" this.jobName = queueLogDir + outBam + ".joinBams" } @@ -155,6 +197,8 @@ class dataProcessingV2 extends QScript { this.mismatchFraction = Some(0.0) this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) this.rodBind :+= RodBind("indels", "VCF", indels) + this.scatterCount = nContigs + this.analysisName = queueLogDir + outIntervals + ".target" this.jobName = queueLogDir + outIntervals + ".target" } @@ -168,6 +212,8 @@ class dataProcessingV2 extends QScript { this.doNotUseSW = useSW this.compress = Some(0) this.U = Some(org.broadinstitute.sting.gatk.arguments.ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION) // todo -- update this with the last consensus between Tim, Matt and Eric. This is ugly! + this.scatterCount = nContigs + this.analysisName = queueLogDir + outBam + ".clean" this.jobName = queueLogDir + outBam + ".clean" } @@ -182,15 +228,17 @@ class dataProcessingV2 extends QScript { sortOrder = null this.memoryLimit = Some(6) this.jarFile = qscript.dedupJar - this.isIntermediate = true + this.analysisName = queueLogDir + outBam + ".dedup" this.jobName = queueLogDir + outBam + ".dedup" } + //todo -- add scatter gather capability (waiting for khalid's modifications to the queue base case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam this.recal_file = outRecalFile + this.analysisName = queueLogDir + outRecalFile + ".covariates" this.jobName = queueLogDir + outRecalFile + ".covariates" } @@ -204,7 +252,9 @@ class dataProcessingV2 extends QScript { else if (qscript.intervals != null) this.intervals :+= qscript.intervals this.U = Some(org.broadinstitute.sting.gatk.arguments.ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION) // todo -- update this with the last consensus between Tim, Matt and Eric. This is ugly! this.index_output_bam_on_the_fly = Some(true) + this.analysisName = queueLogDir + outBam + ".recalibration" this.jobName = queueLogDir + outBam + ".recalibration" + } case class analyzeCovariates (inRecalFile: File, outPath: File) extends AnalyzeCovariates { @@ -212,12 +262,16 @@ class dataProcessingV2 extends QScript { this.resources = qscript.R this.recal_file = inRecalFile this.output_dir = outPath.toString + this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" } case class writeList(inBams: List[File], outBamList: File) extends ListWriterFunction { this.inputFiles = inBams this.listFile = outBamList + this.analysisName = queueLogDir + outBamList + ".bamList" this.jobName = queueLogDir + outBamList + ".bamList" } + + }