From 89e2943dd19e4a96cc5a766a86e7255427e9bb14 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 25 Feb 2013 14:00:38 -0500 Subject: [PATCH] The maximum kmer length is derived from the reads. -- This is done to take advantage of longer reads which can produce less ambiguous haplotypes -- Integration tests change for HC and BiasedDownsampling --- .../haplotypecaller/DeBruijnAssembler.java | 7 ++++--- .../BiasedDownsamplingIntegrationTest.java | 6 +++--- .../HaplotypeCallerIntegrationTest.java | 16 +++++++-------- .../sting/utils/sam/ReadUtils.java | 15 ++++++++++++++ .../sting/utils/sam/ReadUtilsUnitTest.java | 20 ++++++++++++++++--- 5 files changed, 47 insertions(+), 17 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 087d526da..92962f67f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -75,9 +75,8 @@ import java.util.*; public class DeBruijnAssembler extends LocalAssemblyEngine { private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers - private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 12; + private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; private static final byte MIN_QUALITY = (byte) 16; - private static final int MAX_POSSIBLE_KMER = 75; private static final int GRAPH_KMER_STEP = 6; // Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode @@ -136,7 +135,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { protected void createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { graphs.clear(); - final int maxKmer = Math.min(MAX_POSSIBLE_KMER, refHaplotype.getBases().length - KMER_OVERLAP); + final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; + if( maxKmer < MIN_KMER ) { throw new IllegalStateException("Reads are too small for use in assembly."); } + // create the graph for each possible kmer for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) { final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG ); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java index 794ee8dee..08428b5aa 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java @@ -283,17 +283,17 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { @Test public void testHCFlatContaminationCase1() { - testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "0b9d6aabd5ab448f0a2d32f24ff64840"); + testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "9dbd17769e091ce759efda050cd4f8b2"); } @Test public void testHCFlatContaminationCase2() { - testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "a4ef4a6ce557a6b9666e234fad5c7c80"); + testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "b8cee98c9c693fd336fc5e574dd744ed"); } @Test public void testHCFlatContaminationCase3() { - testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "bacc98eb2baa5bb1777da24cf0f84913"); + testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "e7309bd594b8e4b54b712f9877518b8e"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 489cab95a..05a7ce3be 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "ecf563b63ca3f640d9cfcc548e8ad776"); + HCTest(CEUTRIO_BAM, "", "a9748a39604c4ec8bbdb2cb809a971f1"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "874389182141f41879abea7cb350c9d4"); + HCTest(NA12878_BAM, "", "c55ebed976767e1f93d2e8ada9d52bf8"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "4aa3d0d0a859c0fc0533f29529cc3d95"); + "70a53e566e6a7090e2f29ed608e9d84f"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "cfd717dd79ace99a266e8bb58d6cc7a6"); + "10fdbfeb3b4ea1af7f242a8aca83cb9b"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "58b484324f0ea00aaac25fb7711ad657"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a960722c1ae2b6f774d3443a7e5ac27d"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -136,7 +136,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "0e8a3a31b8fe5f097d6975aee8b67cdc"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f1f867dbbe3747f16a0d9e5f11e6ed64"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,14 +146,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("2acd853da3a0380650de6827b7c790ac")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("061a95cab149723866ce7c797ba6bdd4")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a17e95c1191e3aef7892586fe38ca050")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 39d058aea..709afeef5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -913,4 +913,19 @@ public class ReadUtils { return getBasesReverseComplement(read.getReadBases()); } + /** + * Calculate the maximum read length from the given list of reads. + * @param reads list of reads + * @return non-negative integer + */ + @Ensures({"result >= 0"}) + public static int getMaxReadLength( final List reads ) { + if( reads == null ) { throw new IllegalArgumentException("Attempting to check a null list of reads."); } + + int maxReadLength = 0; + for( final GATKSAMRecord read : reads ) { + maxReadLength = Math.max(maxReadLength, read.getReadLength()); + } + return maxReadLength; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index b01c53e77..baad67d53 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -32,9 +32,7 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.LinkedList; -import java.util.List; -import java.util.Random; +import java.util.*; public class ReadUtilsUnitTest extends BaseTest { @@ -165,4 +163,20 @@ public class ReadUtilsUnitTest extends BaseTest { Assert.assertEquals(reconverted, original); } } + + @Test (enabled = true) + public void testGetMaxReadLength() { + for( final int minLength : Arrays.asList( 5, 30, 50 ) ) { + for( final int maxLength : Arrays.asList( 50, 75, 100 ) ) { + final List reads = new ArrayList(); + for( int readLength = minLength; readLength <= maxLength; readLength++ ) { + reads.add( ReadUtils.createRandomRead( readLength ) ); + } + Assert.assertEquals(ReadUtils.getMaxReadLength(reads), maxLength, "max length does not match"); + } + } + + final List reads = new LinkedList(); + Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); + } }