The maximum kmer length is derived from the reads.

-- This is done to take advantage of longer reads which can produce less ambiguous haplotypes
-- Integration tests change for HC and BiasedDownsampling
This commit is contained in:
Ryan Poplin 2013-02-25 14:00:38 -05:00
parent bd9875aff5
commit 89e2943dd1
5 changed files with 47 additions and 17 deletions

View File

@ -75,9 +75,8 @@ import java.util.*;
public class DeBruijnAssembler extends LocalAssemblyEngine {
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 12;
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
private static final byte MIN_QUALITY = (byte) 16;
private static final int MAX_POSSIBLE_KMER = 75;
private static final int GRAPH_KMER_STEP = 6;
// Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode
@ -136,7 +135,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
protected void createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
graphs.clear();
final int maxKmer = Math.min(MAX_POSSIBLE_KMER, refHaplotype.getBases().length - KMER_OVERLAP);
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
if( maxKmer < MIN_KMER ) { throw new IllegalStateException("Reads are too small for use in assembly."); }
// create the graph for each possible kmer
for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) {
final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG );

View File

@ -283,17 +283,17 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
@Test
public void testHCFlatContaminationCase1() {
testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "0b9d6aabd5ab448f0a2d32f24ff64840");
testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "9dbd17769e091ce759efda050cd4f8b2");
}
@Test
public void testHCFlatContaminationCase2() {
testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "a4ef4a6ce557a6b9666e234fad5c7c80");
testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "b8cee98c9c693fd336fc5e574dd744ed");
}
@Test
public void testHCFlatContaminationCase3() {
testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "bacc98eb2baa5bb1777da24cf0f84913");
testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "e7309bd594b8e4b54b712f9877518b8e");
}
}

View File

@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "ecf563b63ca3f640d9cfcc548e8ad776");
HCTest(CEUTRIO_BAM, "", "a9748a39604c4ec8bbdb2cb809a971f1");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "874389182141f41879abea7cb350c9d4");
HCTest(NA12878_BAM, "", "c55ebed976767e1f93d2e8ada9d52bf8");
}
@Test(enabled = false)
@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"4aa3d0d0a859c0fc0533f29529cc3d95");
"70a53e566e6a7090e2f29ed608e9d84f");
}
private void HCTestComplexGGA(String bam, String args, String md5) {
@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"cfd717dd79ace99a266e8bb58d6cc7a6");
"10fdbfeb3b4ea1af7f242a8aca83cb9b");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "58b484324f0ea00aaac25fb7711ad657");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a960722c1ae2b6f774d3443a7e5ac27d");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -136,7 +136,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "0e8a3a31b8fe5f097d6975aee8b67cdc");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f1f867dbbe3747f16a0d9e5f11e6ed64");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -146,14 +146,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("2acd853da3a0380650de6827b7c790ac"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("061a95cab149723866ce7c797ba6bdd4"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a17e95c1191e3aef7892586fe38ca050"));
executeTest("HCTestStructuralIndels: ", spec);
}

View File

@ -913,4 +913,19 @@ public class ReadUtils {
return getBasesReverseComplement(read.getReadBases());
}
/**
* Calculate the maximum read length from the given list of reads.
* @param reads list of reads
* @return non-negative integer
*/
@Ensures({"result >= 0"})
public static int getMaxReadLength( final List<GATKSAMRecord> reads ) {
if( reads == null ) { throw new IllegalArgumentException("Attempting to check a null list of reads."); }
int maxReadLength = 0;
for( final GATKSAMRecord read : reads ) {
maxReadLength = Math.max(maxReadLength, read.getReadLength());
}
return maxReadLength;
}
}

View File

@ -32,9 +32,7 @@ import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.*;
public class ReadUtilsUnitTest extends BaseTest {
@ -165,4 +163,20 @@ public class ReadUtilsUnitTest extends BaseTest {
Assert.assertEquals(reconverted, original);
}
}
@Test (enabled = true)
public void testGetMaxReadLength() {
for( final int minLength : Arrays.asList( 5, 30, 50 ) ) {
for( final int maxLength : Arrays.asList( 50, 75, 100 ) ) {
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
for( int readLength = minLength; readLength <= maxLength; readLength++ ) {
reads.add( ReadUtils.createRandomRead( readLength ) );
}
Assert.assertEquals(ReadUtils.getMaxReadLength(reads), maxLength, "max length does not match");
}
}
final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero");
}
}