ReducedBAM changes to downsample to a fixed target coverage over the variable regions. The evaluation script now includes filtering and evaluation commands.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5965 348d0f76-0448-11de-a6fe-93d51630548a
depristo 2011-06-08 19:36:08 +00:00
parent fbb68ae94c
commit 44287ea8dc
3 changed files with 81 additions and 28 deletions
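The core of the ReducedBAM change is the per-read keep probability: over a variable span, each read is retained with probability idealBP / rawBP, where idealBP = span size x target depth x 2 (for read directionality) and rawBP is the total base pairs of reads overlapping the span. Below is a minimal, self-contained sketch of that arithmetic with made-up numbers and a hypothetical class name; it is illustrative only and not part of the diff that follows.

// Illustrative sketch only -- not part of this commit. Shows how the per-read
// keep probability thins coverage over a variable span to a fixed target depth.
import java.util.Random;

public class DownsampleSketch {                 // hypothetical class name
    public static void main(String[] args) {
        int spanSize = 100;                     // hypothetical: width of the variable span in bp
        int targetDepth = 250;                  // hypothetical: desired depth at variable sites
        int rawBP = 200000;                     // hypothetical: total bp of reads over the span

        // x2 for the directionality of reads, mirroring the commit
        int idealBP = spanSize * targetDepth * 2;                     // 50,000 bp
        double pKeepPerBP = Math.min(1.0, (double) idealBP / rawBP);  // 0.25 here

        // keeping each read with probability pKeepPerBP leaves ~idealBP bp in expectation
        Random rng = new Random(42);
        int readLength = 100;
        int keptBP = 0;
        for (int i = 0; i < rawBP / readLength; i++) {
            if (rng.nextDouble() < pKeepPerBP) {
                keptBP += readLength;
            }
        }
        System.out.printf("pKeepPerBP=%.2f idealBP=%d keptBP=%d%n", pKeepPerBP, idealBP, keptBP);
    }
}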

MultiSampleConsensusReadCompressor.java

@ -49,11 +49,11 @@ public class MultiSampleConsensusReadCompressor implements ConsensusReadCompress
final int readContextSize,
final GenomeLocParser glParser,
final int minBpForRunningConsensus,
final int maxReadsAtVariableSites) {
final int targetDepthAtVariableSites) {
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
compressorsPerSample.put(name,
new SingleSampleConsensusReadCompressor(name, readContextSize,
glParser, minBpForRunningConsensus, maxReadsAtVariableSites));
glParser, minBpForRunningConsensus, targetDepthAtVariableSites));
// todo -- argument for minConsensusSize
}
}

SingleSampleConsensusReadCompressor.java

@ -1,6 +1,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.reducereads;
import net.sf.samtools.*;
import org.apache.commons.math.stat.descriptive.summary.Sum;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.BaseUtils;
@ -67,7 +68,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
Queue<SAMRecord> waitingReads = new LinkedList<SAMRecord>();
final int readContextSize;
final int maxReadsAtVariableSites;
final int targetDepthAtVariableSites;
final int minBpForRunningConsensus;
int retryTimer = 0;
int consensusCounter = 0;
@ -82,11 +83,11 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
final int readContextSize,
final GenomeLocParser glParser,
final int minBpForRunningConsensus,
final int maxReadsAtVariableSites) {
final int targetDepthAtVariableSites) {
this.readContextSize = readContextSize;
this.glParser = glParser;
this.minBpForRunningConsensus = minBpForRunningConsensus;
this.maxReadsAtVariableSites = maxReadsAtVariableSites;
this.targetDepthAtVariableSites = targetDepthAtVariableSites;
this.reducedReadGroup = createReducedReadGroup(sampleName);
}
@ -326,25 +327,54 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
if ( span.isConserved() )
reads.addAll(conservedSpanReads(sites, span));
else
reads.addAll(downsample(variableSpanReads(sites, span)));
reads.addAll(downsample(variableSpanReads(sites, span), span));
}
return reads;
}
// todo -- should be smart -- should take reads in some priority order
// todo -- by length, and by strand, ideally. Perhaps alternating by strand
// todo -- in order of length?
private Collection<SAMRecord> downsample(Collection<SAMRecord> reads) {
if ( reads.size() > maxReadsAtVariableSites ) {
List<SAMRecord> readArray = new ArrayList<SAMRecord>(reads);
Collections.shuffle(readArray, GenomeAnalysisEngine.getRandomGenerator());
return readArray.subList(0, maxReadsAtVariableSites);
} else {
/**
* Downsamples the reads so that coverage over the span is roughly 2x the target
* depth (the factor of 2 accounts for the directionality of reads).
*
* todo: perhaps it would be better to smooth coverage, so that the probability of
* todo: retaining a read would be proportional to the over-coverage of each site
*
* @param reads the reads overlapping the variable span
* @param span  the variable span being downsampled
* @return the downsampled reads, or the original reads if coverage is already at or below target
*/
private Collection<SAMRecord> downsample(Collection<SAMRecord> reads, ConsensusSpan span) {
// ideally we would cover the span.size() bp of the span at exactly the target depth, x2 for the directionality of reads
int idealBPinSpan = span.size() * targetDepthAtVariableSites * 2;
int rawBPinSpan = readsBP(reads);
// The chance we want to keep a particular bp is ideal / actual
double pKeepPerBP = (1.0 * idealBPinSpan) / rawBPinSpan;
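// note: keeping each read with probability pKeepPerBP yields an expected kept bp of pKeepPerBP * rawBPinSpan = idealBPinSpan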
if ( pKeepPerBP >= 1.0 ) { // not enough coverage
return reads;
} else { // we need to downsample
List<SAMRecord> downsampled = new ArrayList<SAMRecord>();
for ( SAMRecord read : reads ) {
// should this be proportional to read length?
double pKeep = pKeepPerBP; // * read.getReadLength();
if ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < pKeep ) {
downsampled.add(read);
}
}
logger.info(String.format("targetDepth=%d, idealBP=%d, rawBP=%d, pKeepPerBP=%.2e, nRawReads=%d, nKeptReads=%d, keptBP=%d",
targetDepthAtVariableSites, idealBPinSpan, rawBPinSpan, pKeepPerBP, reads.size(), downsampled.size(), readsBP(downsampled)));
return downsampled;
}
}
private static final int readsBP(Collection<SAMRecord> reads) {
int sum = 0;
for ( SAMRecord read : reads ) sum += read.getReadLength();
return sum;
}
private List<SAMRecord> conservedSpanReads(List<ConsensusSite> sites, ConsensusSpan span) {
byte[] bases = new byte[span.size()];
byte[] quals = new byte[span.size()];

ReducedBAMEvaluation.scala

@ -43,14 +43,26 @@ class ReducedBAMEvaluation extends QScript {
def script = {
val reducedBAM = new ReduceBAM(bam)
add(reducedBAM)
callAndEvaluateBAM(reducedBAM.out)
val reducedBAMVCF = callAndEvaluateBAM(reducedBAM.out)
val slicedBAM = new SliceBAM(bam)
add(slicedBAM)
callAndEvaluateBAM(slicedBAM.out)
val fullBAMVCF = callAndEvaluateBAM(slicedBAM.out)
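// combine the two callsets so the eval below can stratify on the CombineVariants "set" annotation (Intersection, fullBAM, reducedBAM, and the filter-discordant sets)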
val combineCalls = new CombineVariants with UNIVERSAL_GATK_ARGS
combineCalls.rodBind :+= RodBind("fullBAM", "VCF", fullBAMVCF)
combineCalls.rodBind :+= RodBind("reducedBAM", "VCF", reducedBAMVCF)
combineCalls.rod_priority_list = "reducedBAM,fullBAM"
combineCalls.filteredrecordsmergetype = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
combineCalls.out = swapExt(reducedBAM.out,".bam",".filtered.combined.vcf")
add(combineCalls)
val eval = new Eval(combineCalls.out) // evaluate the combined VCF
eval.select = List("'set==\"Intersection\"'", "'set==\"fullBAM\"'", "'set==\"reducedBAM\"'", "'set==\"filterInreducedBAM-fullBAM\"'", "'set==\"reducedBAM-filterInfullBAM\"'")
eval.selectName = List("Intersection", "fullBAM", "reducedBAM", "filterInreducedBAM-fullBAM", "reducedBAM-filterInfullBAM")
add(eval)
}
def callAndEvaluateBAM(bam: File) = {
def callAndEvaluateBAM(bam: File): File = {
val rawVCF = new Call(bam)
add(rawVCF)
@ -63,22 +75,30 @@ class ReducedBAMEvaluation extends QScript {
filterSNPs.out = swapExt(rawVCF.out,".vcf",".filtered.vcf")
add(filterSNPs)
val targetEval = new VariantEval with UNIVERSAL_GATK_ARGS
targetEval.rodBind :+= RodBind("eval", "VCF", filterSNPs.out)
if ( dbSNP.exists() )
targetEval.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
targetEval.doNotUseAllStandardStratifications = true
targetEval.doNotUseAllStandardModules = true
targetEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
targetEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter")
targetEval.out = swapExt(filterSNPs.out,".vcf",".eval")
add(targetEval)
// create a variant eval for us
add(new Eval(filterSNPs.out))
// for convenient diffing
add(new DiffableTable(rawVCF.out))
add(new DiffableTable(filterSNPs.out))
return filterSNPs.out
}
class Eval(@Input vcf: File) extends VariantEval with UNIVERSAL_GATK_ARGS {
this.rodBind :+= RodBind("eval", "VCF", vcf)
if ( dbSNP.exists() )
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.doNotUseAllStandardStratifications = true
this.doNotUseAllStandardModules = true
this.evalModule = List("TiTvVariantEvaluator", "CountVariants")
this.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "JexlExpression")
this.out = swapExt(vcf,".vcf",".eval")
if ( CALLING_INTERVAL != null )
this.intervalsString = List(CALLING_INTERVAL);
}
class ReduceBAM(bam: File) extends ReduceReads with UNIVERSAL_GATK_ARGS with CoFoJa {
this.memoryLimit = 3
this.input_file = List(bam)
@ -107,6 +127,9 @@ class ReducedBAMEvaluation extends QScript {
this.dcov = DCOV;
this.o = outVCF
if ( dbSNP.exists() )
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
if ( minimalVCF )
this.group = List("none")