ReducedBAM changes to downsample to a fixed coverage over the variable regions. The evaluation script now has filter and eval commands.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5965 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fbb68ae94c
commit
44287ea8dc
|
|
@ -49,11 +49,11 @@ public class MultiSampleConsensusReadCompressor implements ConsensusReadCompress
|
|||
final int readContextSize,
|
||||
final GenomeLocParser glParser,
|
||||
final int minBpForRunningConsensus,
|
||||
final int maxReadsAtVariableSites) {
|
||||
final int targetDepthAtVariableSites) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleConsensusReadCompressor(name, readContextSize,
|
||||
glParser, minBpForRunningConsensus, maxReadsAtVariableSites));
|
||||
glParser, minBpForRunningConsensus, targetDepthAtVariableSites));
|
||||
// todo -- argument for minConsensusSize
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.reducereads;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.apache.commons.math.stat.descriptive.summary.Sum;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -67,7 +68,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
|
|||
Queue<SAMRecord> waitingReads = new LinkedList<SAMRecord>();
|
||||
|
||||
final int readContextSize;
|
||||
final int maxReadsAtVariableSites;
|
||||
final int targetDepthAtVariableSites;
|
||||
final int minBpForRunningConsensus;
|
||||
int retryTimer = 0;
|
||||
int consensusCounter = 0;
|
||||
|
|
@ -82,11 +83,11 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
|
|||
final int readContextSize,
|
||||
final GenomeLocParser glParser,
|
||||
final int minBpForRunningConsensus,
|
||||
final int maxReadsAtVariableSites) {
|
||||
final int targetDepthAtVariableSites) {
|
||||
this.readContextSize = readContextSize;
|
||||
this.glParser = glParser;
|
||||
this.minBpForRunningConsensus = minBpForRunningConsensus;
|
||||
this.maxReadsAtVariableSites = maxReadsAtVariableSites;
|
||||
this.targetDepthAtVariableSites = targetDepthAtVariableSites;
|
||||
this.reducedReadGroup = createReducedReadGroup(sampleName);
|
||||
}
|
||||
|
||||
|
|
@ -326,25 +327,54 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
|
|||
if ( span.isConserved() )
|
||||
reads.addAll(conservedSpanReads(sites, span));
|
||||
else
|
||||
reads.addAll(downsample(variableSpanReads(sites, span)));
|
||||
reads.addAll(downsample(variableSpanReads(sites, span), span));
|
||||
}
|
||||
|
||||
return reads;
|
||||
}
|
||||
|
||||
// todo -- should be smart -- should take reads in some priority order
|
||||
// todo -- by length, and by strand, ideally. Perhaps alternating by strand
|
||||
// todo -- in order of length?
|
||||
private Collection<SAMRecord> downsample(Collection<SAMRecord> reads) {
|
||||
if ( reads.size() > maxReadsAtVariableSites ) {
|
||||
List<SAMRecord> readArray = new ArrayList<SAMRecord>(reads);
|
||||
Collections.shuffle(readArray, GenomeAnalysisEngine.getRandomGenerator());
|
||||
return readArray.subList(0, maxReadsAtVariableSites);
|
||||
} else {
|
||||
/**
|
||||
* Downsamples the reads until we have 2x the ideal target depth in the span.
|
||||
*
|
||||
* todo: perhaps it would be better to smooth coverage, so that the probability of
|
||||
* todo: retaining a read would be proportional to the over-coverage of each site
|
||||
*
|
||||
* @param reads
|
||||
* @param span
|
||||
* @return
|
||||
*/
|
||||
private Collection<SAMRecord> downsample(Collection<SAMRecord> reads, ConsensusSpan span) {
|
||||
// ideally, we would have exactly span bp at target depth, x2 for the directionality of reads
|
||||
int idealBPinSpan = span.size() * targetDepthAtVariableSites * 2;
|
||||
int rawBPinSpan = readsBP(reads);
|
||||
|
||||
// The chance we want to keep a particular bp is ideal / actual
|
||||
double pKeepPerBP = (1.0 * idealBPinSpan) / rawBPinSpan;
|
||||
|
||||
if ( pKeepPerBP >= 1.0 ) { // not enough coverage
|
||||
return reads;
|
||||
} else { // we don'need to downsample
|
||||
List<SAMRecord> downsampled = new ArrayList<SAMRecord>();
|
||||
for ( SAMRecord read : reads ) {
|
||||
// should this be proportional to read length?
|
||||
double pKeep = pKeepPerBP; // * read.getReadLength();
|
||||
if ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < pKeep ) {
|
||||
downsampled.add(read);
|
||||
}
|
||||
}
|
||||
|
||||
logger.info(String.format("targetDepth=%d, idealBP=%d, rawBP=%d, pKeepPerBP=%.2e, nRawReads=%d, nKeptReads=%d, keptBP=%d",
|
||||
targetDepthAtVariableSites, idealBPinSpan, rawBPinSpan, pKeepPerBP, reads.size(), downsampled.size(), readsBP(downsampled)));
|
||||
return downsampled;
|
||||
}
|
||||
}
|
||||
|
||||
private static final int readsBP(Collection<SAMRecord> reads) {
|
||||
int sum = 0;
|
||||
for ( SAMRecord read : reads ) sum += read.getReadLength();
|
||||
return sum;
|
||||
}
|
||||
|
||||
private List<SAMRecord> conservedSpanReads(List<ConsensusSite> sites, ConsensusSpan span) {
|
||||
byte[] bases = new byte[span.size()];
|
||||
byte[] quals = new byte[span.size()];
|
||||
|
|
|
|||
|
|
@ -43,14 +43,26 @@ class ReducedBAMEvaluation extends QScript {
|
|||
def script = {
|
||||
val reducedBAM = new ReduceBAM(bam)
|
||||
add(reducedBAM)
|
||||
callAndEvaluateBAM(reducedBAM.out)
|
||||
val reducedBAMVCF = callAndEvaluateBAM(reducedBAM.out)
|
||||
|
||||
val slicedBAM = new SliceBAM(bam)
|
||||
add(slicedBAM)
|
||||
callAndEvaluateBAM(slicedBAM.out)
|
||||
val fullBAMVCF = callAndEvaluateBAM(slicedBAM.out)
|
||||
|
||||
val combineCalls = new CombineVariants with UNIVERSAL_GATK_ARGS
|
||||
combineCalls.rodBind :+= RodBind("fullBAM", "VCF", fullBAMVCF)
|
||||
combineCalls.rodBind :+= RodBind("reducedBAM", "VCF", reducedBAMVCF)
|
||||
combineCalls.rod_priority_list = "reducedBAM,fullBAM"
|
||||
combineCalls.filteredrecordsmergetype = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
|
||||
combineCalls.out = swapExt(reducedBAM.out,".bam",".filtered.combined.vcf")
|
||||
add(combineCalls)
|
||||
val eval = new Eval(combineCalls.out) // evaluate the combined VCF
|
||||
eval.select = List("'set==\"Intersection\"'", "'set==\"fullBAM\"'", "'set==\"reducedBAM\"'", "'set==\"filterInreducedBAM-fullBAM\"'", "'set==\"reducedBAM-filterInfullBAM\"'")
|
||||
eval.selectName = List("Intersection", "fullBAM", "reducedBAM", "filterInreducedBAM-fullBAM", "reducedBAM-filterInfullBAM")
|
||||
add(eval)
|
||||
}
|
||||
|
||||
def callAndEvaluateBAM(bam: File) = {
|
||||
def callAndEvaluateBAM(bam: File): File = {
|
||||
val rawVCF = new Call(bam)
|
||||
add(rawVCF)
|
||||
|
||||
|
|
@ -63,22 +75,30 @@ class ReducedBAMEvaluation extends QScript {
|
|||
filterSNPs.out = swapExt(rawVCF.out,".vcf",".filtered.vcf")
|
||||
add(filterSNPs)
|
||||
|
||||
val targetEval = new VariantEval with UNIVERSAL_GATK_ARGS
|
||||
targetEval.rodBind :+= RodBind("eval", "VCF", filterSNPs.out)
|
||||
if ( dbSNP.exists() )
|
||||
targetEval.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
|
||||
targetEval.doNotUseAllStandardStratifications = true
|
||||
targetEval.doNotUseAllStandardModules = true
|
||||
targetEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
|
||||
targetEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter")
|
||||
targetEval.out = swapExt(filterSNPs.out,".vcf",".eval")
|
||||
add(targetEval)
|
||||
// create a variant eval for us
|
||||
add(new Eval(filterSNPs.out))
|
||||
|
||||
// for convenient diffing
|
||||
add(new DiffableTable(rawVCF.out))
|
||||
add(new DiffableTable(filterSNPs.out))
|
||||
|
||||
return filterSNPs.out
|
||||
}
|
||||
|
||||
// Convenience VariantEval task: evaluates a single VCF with a minimal set of
// eval modules and stratifications, writing the report next to the input
// as <vcf-basename>.eval.
class Eval(@Input vcf: File) extends VariantEval with UNIVERSAL_GATK_ARGS {
    this.rodBind :+= RodBind("eval", "VCF", vcf)
    // use dbSNP as the known-sites track only when the file is present locally
    if ( dbSNP.exists() )
        this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
    // start from an empty module/stratification set and opt in explicitly
    this.doNotUseAllStandardStratifications = true
    this.doNotUseAllStandardModules = true
    this.evalModule = List("TiTvVariantEvaluator", "CountVariants")
    this.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "JexlExpression")
    this.out = swapExt(vcf,".vcf",".eval")
    // restrict evaluation to the calling interval when one was configured
    if ( CALLING_INTERVAL != null )
        this.intervalsString = List(CALLING_INTERVAL);
}
|
||||
|
||||
|
||||
class ReduceBAM(bam: File) extends ReduceReads with UNIVERSAL_GATK_ARGS with CoFoJa {
|
||||
this.memoryLimit = 3
|
||||
this.input_file = List(bam)
|
||||
|
|
@ -107,6 +127,9 @@ class ReducedBAMEvaluation extends QScript {
|
|||
this.dcov = DCOV;
|
||||
this.o = outVCF
|
||||
|
||||
if ( dbSNP.exists() )
|
||||
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
|
||||
|
||||
if ( minimalVCF )
|
||||
this.group = List("none")
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue