diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/MultiSampleConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/MultiSampleConsensusReadCompressor.java index a6485ccee..6d4c9ded5 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/MultiSampleConsensusReadCompressor.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/MultiSampleConsensusReadCompressor.java @@ -49,11 +49,11 @@ public class MultiSampleConsensusReadCompressor implements ConsensusReadCompress final int readContextSize, final GenomeLocParser glParser, final int minBpForRunningConsensus, - final int maxReadsAtVariableSites) { + final int targetDepthAtVariableSites) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleConsensusReadCompressor(name, readContextSize, - glParser, minBpForRunningConsensus, maxReadsAtVariableSites)); + glParser, minBpForRunningConsensus, targetDepthAtVariableSites)); // todo -- argument for minConsensusSize } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java index a0d655528..c40430594 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.reducereads; import net.sf.samtools.*; +import org.apache.commons.math.stat.descriptive.summary.Sum; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.BaseUtils; @@ -67,7 +68,7 @@ public class SingleSampleConsensusReadCompressor 
implements ConsensusReadCompres Queue waitingReads = new LinkedList(); final int readContextSize; - final int maxReadsAtVariableSites; + final int targetDepthAtVariableSites; final int minBpForRunningConsensus; int retryTimer = 0; int consensusCounter = 0; @@ -82,11 +83,11 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres final int readContextSize, final GenomeLocParser glParser, final int minBpForRunningConsensus, - final int maxReadsAtVariableSites) { + final int targetDepthAtVariableSites) { this.readContextSize = readContextSize; this.glParser = glParser; this.minBpForRunningConsensus = minBpForRunningConsensus; - this.maxReadsAtVariableSites = maxReadsAtVariableSites; + this.targetDepthAtVariableSites = targetDepthAtVariableSites; this.reducedReadGroup = createReducedReadGroup(sampleName); } @@ -326,25 +327,54 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres if ( span.isConserved() ) reads.addAll(conservedSpanReads(sites, span)); else - reads.addAll(downsample(variableSpanReads(sites, span))); + reads.addAll(downsample(variableSpanReads(sites, span), span)); } return reads; } - // todo -- should be smart -- should take reads in some priority order - // todo -- by length, and by strand, ideally. Perhaps alternating by strand - // todo -- in order of length? - private Collection downsample(Collection reads) { - if ( reads.size() > maxReadsAtVariableSites ) { - List readArray = new ArrayList(reads); - Collections.shuffle(readArray, GenomeAnalysisEngine.getRandomGenerator()); - return readArray.subList(0, maxReadsAtVariableSites); - } else { + /** + * Downsamples the reads until we have 2x the ideal target depth in the span. 
+ * + * todo: perhaps it would be better to smooth coverage, so that the probability of + * todo: retaining a read would be proportional to the over-coverage of each site + * + * @param reads the reads overlapping the variable span + * @param span the consensus span whose size determines the target base-pair budget + * @return the input reads, randomly thinned when raw coverage exceeds the target + */ + private Collection downsample(Collection reads, ConsensusSpan span) { + // ideally, we would have exactly span bp at target depth, x2 for the directionality of reads + int idealBPinSpan = span.size() * targetDepthAtVariableSites * 2; + int rawBPinSpan = readsBP(reads); + + // The chance we want to keep a particular bp is ideal / actual + double pKeepPerBP = (1.0 * idealBPinSpan) / rawBPinSpan; + + if ( pKeepPerBP >= 1.0 ) { // not enough coverage return reads; + } else { // we need to downsample + List downsampled = new ArrayList(); + for ( SAMRecord read : reads ) { + // should this be proportional to read length? + double pKeep = pKeepPerBP; // * read.getReadLength(); + if ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < pKeep ) { + downsampled.add(read); + } + } + + logger.info(String.format("targetDepth=%d, idealBP=%d, rawBP=%d, pKeepPerBP=%.2e, nRawReads=%d, nKeptReads=%d, keptBP=%d", + targetDepthAtVariableSites, idealBPinSpan, rawBPinSpan, pKeepPerBP, reads.size(), downsampled.size(), readsBP(downsampled))); + return downsampled; } } + private static final int readsBP(Collection reads) { + int sum = 0; + for ( SAMRecord read : reads ) sum += read.getReadLength(); + return sum; + } + private List conservedSpanReads(List sites, ConsensusSpan span) { byte[] bases = new byte[span.size()]; byte[] quals = new byte[span.size()]; diff --git a/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala b/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala index 0a9fa300e..41e5f8334 100755 --- a/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala +++ b/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala @@ -43,14 +43,26 @@ class ReducedBAMEvaluation extends QScript { def script = { val reducedBAM = new
ReduceBAM(bam) add(reducedBAM) - callAndEvaluateBAM(reducedBAM.out) + val reducedBAMVCF = callAndEvaluateBAM(reducedBAM.out) val slicedBAM = new SliceBAM(bam) add(slicedBAM) - callAndEvaluateBAM(slicedBAM.out) + val fullBAMVCF = callAndEvaluateBAM(slicedBAM.out) + + val combineCalls = new CombineVariants with UNIVERSAL_GATK_ARGS + combineCalls.rodBind :+= RodBind("fullBAM", "VCF", fullBAMVCF) + combineCalls.rodBind :+= RodBind("reducedBAM", "VCF", reducedBAMVCF) + combineCalls.rod_priority_list = "reducedBAM,fullBAM" + combineCalls.filteredrecordsmergetype = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED + combineCalls.out = swapExt(reducedBAM.out,".bam",".filtered.combined.vcf") + add(combineCalls) + val eval = new Eval(combineCalls.out) // evaluate the combined VCF + eval.select = List("'set==\"Intersection\"'", "'set==\"fullBAM\"'", "'set==\"reducedBAM\"'", "'set==\"filterInreducedBAM-fullBAM\"'", "'set==\"reducedBAM-filterInfullBAM\"'") + eval.selectName = List("Intersection", "fullBAM", "reducedBAM", "filterInreducedBAM-fullBAM", "reducedBAM-filterInfullBAM") + add(eval) } - def callAndEvaluateBAM(bam: File) = { + def callAndEvaluateBAM(bam: File): File = { val rawVCF = new Call(bam) add(rawVCF) @@ -63,22 +75,30 @@ class ReducedBAMEvaluation extends QScript { filterSNPs.out = swapExt(rawVCF.out,".vcf",".filtered.vcf") add(filterSNPs) - val targetEval = new VariantEval with UNIVERSAL_GATK_ARGS - targetEval.rodBind :+= RodBind("eval", "VCF", filterSNPs.out) - if ( dbSNP.exists() ) - targetEval.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - targetEval.doNotUseAllStandardStratifications = true - targetEval.doNotUseAllStandardModules = true - targetEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants") - targetEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter") - targetEval.out = swapExt(filterSNPs.out,".vcf",".eval") - add(targetEval) + // 
create a variant eval for us + add(new Eval(filterSNPs.out)) // for convenient diffing add(new DiffableTable(rawVCF.out)) add(new DiffableTable(filterSNPs.out)) + + return filterSNPs.out } + class Eval(@Input vcf: File) extends VariantEval with UNIVERSAL_GATK_ARGS { + this.rodBind :+= RodBind("eval", "VCF", vcf) + if ( dbSNP.exists() ) + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.doNotUseAllStandardStratifications = true + this.doNotUseAllStandardModules = true + this.evalModule = List("TiTvVariantEvaluator", "CountVariants") + this.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "JexlExpression") + this.out = swapExt(vcf,".vcf",".eval") + if ( CALLING_INTERVAL != null ) + this.intervalsString = List(CALLING_INTERVAL); + } + + class ReduceBAM(bam: File) extends ReduceReads with UNIVERSAL_GATK_ARGS with CoFoJa { this.memoryLimit = 3 this.input_file = List(bam) @@ -107,6 +127,9 @@ class ReducedBAMEvaluation extends QScript { this.dcov = DCOV; this.o = outVCF + if ( dbSNP.exists() ) + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + if ( minimalVCF ) this.group = List("none")