ReducedBAM changes to downsample to a fixed coverage over the variable regions. Evaluation script now has filters and eval. commands.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5965 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fbb68ae94c
commit
44287ea8dc
|
|
@ -49,11 +49,11 @@ public class MultiSampleConsensusReadCompressor implements ConsensusReadCompress
|
||||||
final int readContextSize,
|
final int readContextSize,
|
||||||
final GenomeLocParser glParser,
|
final GenomeLocParser glParser,
|
||||||
final int minBpForRunningConsensus,
|
final int minBpForRunningConsensus,
|
||||||
final int maxReadsAtVariableSites) {
|
final int targetDepthAtVariableSites) {
|
||||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||||
compressorsPerSample.put(name,
|
compressorsPerSample.put(name,
|
||||||
new SingleSampleConsensusReadCompressor(name, readContextSize,
|
new SingleSampleConsensusReadCompressor(name, readContextSize,
|
||||||
glParser, minBpForRunningConsensus, maxReadsAtVariableSites));
|
glParser, minBpForRunningConsensus, targetDepthAtVariableSites));
|
||||||
// todo -- argument for minConsensusSize
|
// todo -- argument for minConsensusSize
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.reducereads;
|
package org.broadinstitute.sting.playground.gatk.walkers.reducereads;
|
||||||
|
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
import org.apache.commons.math.stat.descriptive.summary.Sum;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
@ -67,7 +68,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
|
||||||
Queue<SAMRecord> waitingReads = new LinkedList<SAMRecord>();
|
Queue<SAMRecord> waitingReads = new LinkedList<SAMRecord>();
|
||||||
|
|
||||||
final int readContextSize;
|
final int readContextSize;
|
||||||
final int maxReadsAtVariableSites;
|
final int targetDepthAtVariableSites;
|
||||||
final int minBpForRunningConsensus;
|
final int minBpForRunningConsensus;
|
||||||
int retryTimer = 0;
|
int retryTimer = 0;
|
||||||
int consensusCounter = 0;
|
int consensusCounter = 0;
|
||||||
|
|
@ -82,11 +83,11 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
|
||||||
final int readContextSize,
|
final int readContextSize,
|
||||||
final GenomeLocParser glParser,
|
final GenomeLocParser glParser,
|
||||||
final int minBpForRunningConsensus,
|
final int minBpForRunningConsensus,
|
||||||
final int maxReadsAtVariableSites) {
|
final int targetDepthAtVariableSites) {
|
||||||
this.readContextSize = readContextSize;
|
this.readContextSize = readContextSize;
|
||||||
this.glParser = glParser;
|
this.glParser = glParser;
|
||||||
this.minBpForRunningConsensus = minBpForRunningConsensus;
|
this.minBpForRunningConsensus = minBpForRunningConsensus;
|
||||||
this.maxReadsAtVariableSites = maxReadsAtVariableSites;
|
this.targetDepthAtVariableSites = targetDepthAtVariableSites;
|
||||||
this.reducedReadGroup = createReducedReadGroup(sampleName);
|
this.reducedReadGroup = createReducedReadGroup(sampleName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -326,25 +327,54 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
|
||||||
if ( span.isConserved() )
|
if ( span.isConserved() )
|
||||||
reads.addAll(conservedSpanReads(sites, span));
|
reads.addAll(conservedSpanReads(sites, span));
|
||||||
else
|
else
|
||||||
reads.addAll(downsample(variableSpanReads(sites, span)));
|
reads.addAll(downsample(variableSpanReads(sites, span), span));
|
||||||
}
|
}
|
||||||
|
|
||||||
return reads;
|
return reads;
|
||||||
}
|
}
|
||||||
|
|
||||||
// todo -- should be smart -- should take reads in some priority order
|
/**
|
||||||
// todo -- by length, and by strand, ideally. Perhaps alternating by strand
|
* Randomly downsamples the reads so that the expected number of retained bases is 2x the target depth across the span (x2 for the two read directions).
|
||||||
// todo -- in order of length?
|
*
|
||||||
private Collection<SAMRecord> downsample(Collection<SAMRecord> reads) {
|
* todo: perhaps it would be better to smooth coverage, so that the probability of
|
||||||
if ( reads.size() > maxReadsAtVariableSites ) {
|
* todo: retaining a read would be proportional to the over-coverage of each site
|
||||||
List<SAMRecord> readArray = new ArrayList<SAMRecord>(reads);
|
*
|
||||||
Collections.shuffle(readArray, GenomeAnalysisEngine.getRandomGenerator());
|
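* @param reads the reads collected for the variable span; returned as-is when their total bases do not exceed the ideal amount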
|
||||||
return readArray.subList(0, maxReadsAtVariableSites);
|
* @param span the variable span being processed; its size sets the ideal number of bases to keep
|
||||||
} else {
|
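* @return the original collection when no downsampling is needed, otherwise a new list holding the randomly retained reads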
|
||||||
|
*/
|
||||||
|
private Collection<SAMRecord> downsample(Collection<SAMRecord> reads, ConsensusSpan span) {
|
||||||
|
// ideally, we would have exactly span bp at target depth, x2 for the directionality of reads
|
||||||
|
int idealBPinSpan = span.size() * targetDepthAtVariableSites * 2;
|
||||||
|
int rawBPinSpan = readsBP(reads);
|
||||||
|
|
||||||
|
// The chance we want to keep a particular bp is ideal / actual
|
||||||
|
double pKeepPerBP = (1.0 * idealBPinSpan) / rawBPinSpan;
|
||||||
|
|
||||||
|
if ( pKeepPerBP >= 1.0 ) { // not enough coverage
|
||||||
return reads;
|
return reads;
|
||||||
|
} else { // too much coverage, so we do need to downsample
|
||||||
|
List<SAMRecord> downsampled = new ArrayList<SAMRecord>();
|
||||||
|
for ( SAMRecord read : reads ) {
|
||||||
|
// should this be proportional to read length?
|
||||||
|
double pKeep = pKeepPerBP; // * read.getReadLength();
|
||||||
|
if ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < pKeep ) {
|
||||||
|
downsampled.add(read);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(String.format("targetDepth=%d, idealBP=%d, rawBP=%d, pKeepPerBP=%.2e, nRawReads=%d, nKeptReads=%d, keptBP=%d",
|
||||||
|
targetDepthAtVariableSites, idealBPinSpan, rawBPinSpan, pKeepPerBP, reads.size(), downsampled.size(), readsBP(downsampled)));
|
||||||
|
return downsampled;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static final int readsBP(Collection<SAMRecord> reads) {
|
||||||
|
int sum = 0;
|
||||||
|
for ( SAMRecord read : reads ) sum += read.getReadLength();
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
private List<SAMRecord> conservedSpanReads(List<ConsensusSite> sites, ConsensusSpan span) {
|
private List<SAMRecord> conservedSpanReads(List<ConsensusSite> sites, ConsensusSpan span) {
|
||||||
byte[] bases = new byte[span.size()];
|
byte[] bases = new byte[span.size()];
|
||||||
byte[] quals = new byte[span.size()];
|
byte[] quals = new byte[span.size()];
|
||||||
|
|
|
||||||
|
|
@ -43,14 +43,26 @@ class ReducedBAMEvaluation extends QScript {
|
||||||
def script = {
|
def script = {
|
||||||
val reducedBAM = new ReduceBAM(bam)
|
val reducedBAM = new ReduceBAM(bam)
|
||||||
add(reducedBAM)
|
add(reducedBAM)
|
||||||
callAndEvaluateBAM(reducedBAM.out)
|
val reducedBAMVCF = callAndEvaluateBAM(reducedBAM.out)
|
||||||
|
|
||||||
val slicedBAM = new SliceBAM(bam)
|
val slicedBAM = new SliceBAM(bam)
|
||||||
add(slicedBAM)
|
add(slicedBAM)
|
||||||
callAndEvaluateBAM(slicedBAM.out)
|
val fullBAMVCF = callAndEvaluateBAM(slicedBAM.out)
|
||||||
|
|
||||||
|
val combineCalls = new CombineVariants with UNIVERSAL_GATK_ARGS
|
||||||
|
combineCalls.rodBind :+= RodBind("fullBAM", "VCF", fullBAMVCF)
|
||||||
|
combineCalls.rodBind :+= RodBind("reducedBAM", "VCF", reducedBAMVCF)
|
||||||
|
combineCalls.rod_priority_list = "reducedBAM,fullBAM"
|
||||||
|
combineCalls.filteredrecordsmergetype = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
|
||||||
|
combineCalls.out = swapExt(reducedBAM.out,".bam",".filtered.combined.vcf")
|
||||||
|
add(combineCalls)
|
||||||
|
val eval = new Eval(combineCalls.out) // evaluate the combined VCF
|
||||||
|
eval.select = List("'set==\"Intersection\"'", "'set==\"fullBAM\"'", "'set==\"reducedBAM\"'", "'set==\"filterInreducedBAM-fullBAM\"'", "'set==\"reducedBAM-filterInfullBAM\"'")
|
||||||
|
eval.selectName = List("Intersection", "fullBAM", "reducedBAM", "filterInreducedBAM-fullBAM", "reducedBAM-filterInfullBAM")
|
||||||
|
add(eval)
|
||||||
}
|
}
|
||||||
|
|
||||||
def callAndEvaluateBAM(bam: File) = {
|
def callAndEvaluateBAM(bam: File): File = {
|
||||||
val rawVCF = new Call(bam)
|
val rawVCF = new Call(bam)
|
||||||
add(rawVCF)
|
add(rawVCF)
|
||||||
|
|
||||||
|
|
@ -63,22 +75,30 @@ class ReducedBAMEvaluation extends QScript {
|
||||||
filterSNPs.out = swapExt(rawVCF.out,".vcf",".filtered.vcf")
|
filterSNPs.out = swapExt(rawVCF.out,".vcf",".filtered.vcf")
|
||||||
add(filterSNPs)
|
add(filterSNPs)
|
||||||
|
|
||||||
val targetEval = new VariantEval with UNIVERSAL_GATK_ARGS
|
// create a variant eval for us
|
||||||
targetEval.rodBind :+= RodBind("eval", "VCF", filterSNPs.out)
|
add(new Eval(filterSNPs.out))
|
||||||
if ( dbSNP.exists() )
|
|
||||||
targetEval.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
|
|
||||||
targetEval.doNotUseAllStandardStratifications = true
|
|
||||||
targetEval.doNotUseAllStandardModules = true
|
|
||||||
targetEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
|
|
||||||
targetEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter")
|
|
||||||
targetEval.out = swapExt(filterSNPs.out,".vcf",".eval")
|
|
||||||
add(targetEval)
|
|
||||||
|
|
||||||
// for convenient diffing
|
// for convenient diffing
|
||||||
add(new DiffableTable(rawVCF.out))
|
add(new DiffableTable(rawVCF.out))
|
||||||
add(new DiffableTable(filterSNPs.out))
|
add(new DiffableTable(filterSNPs.out))
|
||||||
|
|
||||||
|
return filterSNPs.out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * VariantEval job for a single VCF.
 *
 * Binds the given VCF as the "eval" rod (plus dbSNP when that file exists), turns off the
 * standard modules and stratifications in favor of an explicit list, and writes its report
 * next to the input with the ".vcf" extension swapped for ".eval".
 *
 * @param vcf the VCF to evaluate
 */
class Eval(@Input vcf: File) extends VariantEval with UNIVERSAL_GATK_ARGS {
  // report goes alongside the input VCF
  this.out = swapExt(vcf,".vcf",".eval")

  // only run the explicitly requested evaluators and stratifications
  this.doNotUseAllStandardModules = true
  this.doNotUseAllStandardStratifications = true
  this.evalModule = List("TiTvVariantEvaluator", "CountVariants")
  this.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "JexlExpression")

  // the call set under evaluation, then dbSNP if it is available on disk
  this.rodBind :+= RodBind("eval", "VCF", vcf)
  if ( dbSNP.exists() ) {
    this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
  }

  // restrict to the calling interval when one was configured
  if ( CALLING_INTERVAL != null ) {
    this.intervalsString = List(CALLING_INTERVAL)
  }
}
|
||||||
|
|
||||||
|
|
||||||
class ReduceBAM(bam: File) extends ReduceReads with UNIVERSAL_GATK_ARGS with CoFoJa {
|
class ReduceBAM(bam: File) extends ReduceReads with UNIVERSAL_GATK_ARGS with CoFoJa {
|
||||||
this.memoryLimit = 3
|
this.memoryLimit = 3
|
||||||
this.input_file = List(bam)
|
this.input_file = List(bam)
|
||||||
|
|
@ -107,6 +127,9 @@ class ReducedBAMEvaluation extends QScript {
|
||||||
this.dcov = DCOV;
|
this.dcov = DCOV;
|
||||||
this.o = outVCF
|
this.o = outVCF
|
||||||
|
|
||||||
|
if ( dbSNP.exists() )
|
||||||
|
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
|
||||||
|
|
||||||
if ( minimalVCF )
|
if ( minimalVCF )
|
||||||
this.group = List("none")
|
this.group = List("none")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue