Breaking up my massive commits into smaller pieces that I can successfully merge and digest. This one enables downsampling in the HaplotypeCaller (by lowering the default dcov to 20) and removes my long-standing, temporary region-based downsampling.

This commit is contained in:
Ryan Poplin 2013-01-30 16:14:07 -05:00
parent 591df2be44
commit 5f4a063def
2 changed files with 16 additions and 44 deletions

View File

@ -56,6 +56,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
@ -132,7 +133,7 @@ import java.util.*;
@PartitionBy(PartitionType.LOCUS)
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
@ActiveRegionTraversalParameters(extension=65, maxRegion=300)
//@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=5)
@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=20)
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatible {
/**
@ -180,9 +181,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false)
protected int minKmer = 11;
@Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false)
protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000;
/**
* If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling
* when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the
@ -463,7 +461,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
allelesToGenotype.removeAll( activeAllelesToGenotype );
}
if( !activeRegion.isActive()) { return 0; } // Not active so nothing to do!
if( !activeRegion.isActive() ) { return 0; } // Not active so nothing to do!
if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do!
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do!
@ -474,7 +472,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region
final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING);
//int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion ));
final ArrayList<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, fullSpanBeforeClipping, MIN_PRUNE_FACTOR, activeAllelesToGenotype );
if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
@ -571,18 +568,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) );
}
Collections.shuffle(finalizedReadList, GenomeAnalysisEngine.getRandomGenerator());
// Loop through the reads hard clipping the adaptor and low quality tails
final ArrayList<GATKSAMRecord> readsToUse = new ArrayList<GATKSAMRecord>(finalizedReadList.size());
for( final GATKSAMRecord myRead : finalizedReadList ) {
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
// protect against INTERVALS with abnormally high coverage
// TODO BUGBUG: remove when positional downsampler is hooked up to ART/HC
if( activeRegion.readOverlapsRegion(clippedRead) &&
clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
readsToUse.add(clippedRead);
}
}
@ -711,24 +703,4 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
return new Cigar(readCigarElements);
}
/*
private int determinePruneFactorFromCoverage( final ActiveRegion activeRegion ) {
final ArrayList<Integer> readLengthDistribution = new ArrayList<Integer>();
for( final GATKSAMRecord read : activeRegion.getReads() ) {
readLengthDistribution.add(read.getReadLength());
}
final double meanReadLength = MathUtils.average(readLengthDistribution);
final double meanCoveragePerSample = (double) activeRegion.getReads().size() / ((double) activeRegion.getExtendedLoc().size() / meanReadLength) / (double) samplesList.size();
int PRUNE_FACTOR = 0;
if( meanCoveragePerSample > 8.5 ) {
PRUNE_FACTOR = (int) Math.floor( Math.sqrt( meanCoveragePerSample - 5.0 ) );
} else if( meanCoveragePerSample > 3.0 ) {
PRUNE_FACTOR = 1;
}
if( DEBUG ) { System.out.println(String.format("Mean coverage per sample = %.1f --> prune factor = %d", meanCoveragePerSample, PRUNE_FACTOR)); }
return PRUNE_FACTOR;
}
*/
}

View File

@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "664a14590d0966e63d3aabff2d7bab0a");
HCTest(CEUTRIO_BAM, "", "72ce6a5e46644dfd73aeffba9d6131ea");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "111f3dc086a3cea1be9bd1ad6e1d18ed");
HCTest(NA12878_BAM, "", "f9d696391f1f337092d70e3abcd32bfb");
}
@Test(enabled = false)
@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"c70f753f7918a1c670ce4ed5c66de09e");
"4e8beb2cdc3d77427f14acf37cea2bd0");
}
private void HCTestComplexGGA(String bam, String args, String md5) {
@ -96,13 +96,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"b1d3070f0c49becf34101e480ab6c589");
"75e1df0dcf3728fd2b6e4735c4cc88ce");
}
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"20eba2e54266f6aebf35b7b7b7e754e3");
"1d244f2adbc72a0062eb673d56cbb5a8");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9805488d85e59e1ae002d0d32d7011d");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a1bc844f62a9cb60dbb70d00ad36b85d");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -124,7 +124,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleSymbolic() {
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "4544a2916f46f58b32db8008776b91a3");
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "23956e572f19ff26d25bbdfaa307675b");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f3984a91e7562494c2a7e41fd05a6734");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "1255f466aa2d288f015cd55d8fece1ac");
}
// That problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -146,14 +146,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3e9e025c539be6c5e0d0f2e5d8e86a62"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("103c91c4a78164949e166d3d27eb459b"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("34129e6c6310ef4eeeeb59b0e7ac0464"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("87fe31a4bbd68a9eb5d5910db5011c82"));
executeTest("HCTestStructuralIndels: ", spec);
}
@ -175,7 +175,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("5f4c07aaf1d2d34cccce43196a5fbd71"));
Arrays.asList("0fa19ec5cf737a3445544b59ecc995e9"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("6ead001b1f8e7cb433fd335f78fde5f0"));
Arrays.asList("5f4cbdcc9bffee6bba258dfac89492ed"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
}