diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index ba1da7c87..02821ab50 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.pileup.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -78,54 +77,46 @@ public class AlleleBiasedDownsamplingUtils { for ( int i = 0; i < 4; i++ ) alleleStratifiedElements[i] = new ArrayList(); - // keep all of the reduced reads - final ArrayList reducedReadPileups = new ArrayList(); - // start by stratifying the reads by the alleles they represent at this position - for( final PileupElement pe : pileup ) { + for ( final PileupElement pe : pileup ) { // we do not want to remove a reduced read - if ( pe.getRead().isReducedRead() ) { - reducedReadPileups.add(pe); - } else { + if ( !pe.getRead().isReducedRead() ) { final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); if ( baseIndex != -1 ) alleleStratifiedElements[baseIndex].add(pe); } } - // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. - int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor - final TreeSet elementsToKeep = new TreeSet(new Comparator() { - @Override - public int compare(PileupElement element1, PileupElement element2) { - final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); - return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); - } - }); - elementsToKeep.addAll(reducedReadPileups); - // make a listing of allele counts final int[] alleleCounts = new int[4]; for ( int i = 0; i < 4; i++ ) alleleCounts[i] = alleleStratifiedElements[i].size(); // do smart down-sampling + int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + final HashSet readsToRemove = new HashSet(numReadsToRemove); for ( int i = 0; i < 4; i++ ) { final ArrayList alleleList = alleleStratifiedElements[i]; - // if we don't need to remove any reads, keep them all - if ( alleleList.size() <= targetAlleleCounts[i] ) - elementsToKeep.addAll(alleleList); - else - elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log)); + // if we don't need to remove any reads, then don't + if ( alleleList.size() > targetAlleleCounts[i] ) + readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log)); } // clean up pointers so memory can be garbage collected if needed for ( int i = 0; i < 4; i++ ) alleleStratifiedElements[i].clear(); - return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); + // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise + final List readsToKeep = new ArrayList(pileup.getNumberOfElements() - numReadsToRemove); + for ( final PileupElement pe : pileup ) { + if ( !readsToRemove.contains(pe) ) { + readsToKeep.add(pe); + } + } + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(readsToKeep)); } private static int scoreAlleleCounts(final int[] alleleCounts) { @@ -189,37 +180,43 @@ public class AlleleBiasedDownsamplingUtils { } /** - * Performs allele biased down-sampling on a pileup and computes the list of elements to keep + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove * * @param elements original list of records * @param numElementsToRemove the number of records to remove * @param log logging output - * @return the list of pileup elements TO KEEP + * @return the list of pileup elements TO REMOVE */ - private static List downsampleElements(final ArrayList elements, final int numElementsToRemove, final PrintStream log) { - if ( numElementsToRemove == 0 ) - return elements; + private static List downsampleElements(final List elements, final int numElementsToRemove, final PrintStream log) { + ArrayList elementsToRemove = new ArrayList(numElementsToRemove); + // are there no elements to remove? + if ( numElementsToRemove == 0 ) + return elementsToRemove; + + // should we remove all of the elements? final int pileupSize = elements.size(); if ( numElementsToRemove == pileupSize ) { logAllElements(elements, log); - return new ArrayList(0); + elementsToRemove.addAll(elements); + return elementsToRemove; } + // create a bitset describing which elements to remove final BitSet itemsToRemove = new BitSet(pileupSize); for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); } - ArrayList elementsToKeep = new ArrayList(pileupSize - numElementsToRemove); for ( int i = 0; i < pileupSize; i++ ) { - if ( itemsToRemove.get(i) ) - logRead(elements.get(i).getRead(), log); - else - elementsToKeep.add(elements.get(i)); + if ( itemsToRemove.get(i) ) { + final T element = elements.get(i); + logElement(element, log); + elementsToRemove.add(element); + } } - return elementsToKeep; + return elementsToRemove; } /** @@ -253,65 +250,30 @@ public class AlleleBiasedDownsamplingUtils { final List alleleBin = alleleReadMap.get(alleles.get(i)); if ( alleleBin.size() > targetAlleleCounts[i] ) { - readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log)); + readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log)); } } return readsToRemove; } - /** - * Performs allele biased down-sampling on a pileup and computes the list of elements to remove - * - * @param reads original list of records - * @param numElementsToRemove the number of records to remove - * @param log logging output - * @return the list of pileup elements TO REMOVE - */ - private static List downsampleReads(final List reads, final int numElementsToRemove, final PrintStream log) { - final ArrayList readsToRemove = new ArrayList(numElementsToRemove); - - if ( numElementsToRemove == 0 ) - return readsToRemove; - - final int pileupSize = reads.size(); - if ( numElementsToRemove == pileupSize ) { - logAllReads(reads, log); - return reads; - } - - final BitSet itemsToRemove = new BitSet(pileupSize); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { - itemsToRemove.set(selectedIndex); - } - - for ( int i = 0; i < pileupSize; i++ ) { - if ( itemsToRemove.get(i) ) { - final GATKSAMRecord read = reads.get(i); - readsToRemove.add(read); - logRead(read, log); + private static void logAllElements(final List elements, final PrintStream log) { + if ( log != null ) { + for ( final T obj : elements ) { + logElement(obj, log); } } - - return readsToRemove; } - private static void logAllElements(final List elements, final PrintStream log) { + private static void logElement(final T obj, final PrintStream log) { if ( log != null ) { - for ( final PileupElement p : elements ) - logRead(p.getRead(), log); - } - } - private static void logAllReads(final List reads, final PrintStream log) { - if ( log != null ) { - for ( final GATKSAMRecord read : reads ) - logRead(read, log); - } - } + final GATKSAMRecord read; + if ( obj instanceof PileupElement ) + read = ((PileupElement)obj).getRead(); + else + read = (GATKSAMRecord)obj; - private static void logRead(final SAMRecord read, final PrintStream log) { - if ( log != null ) { final SAMReadGroupRecord readGroup = read.getReadGroup(); log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit())); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 5b5a75d4e..45a42d018 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("b41b95aaa2c453c9b75b3b29a9c2718e")); + Arrays.asList("35479a79e1ce7c15493bd77e58cadcaa")); executeTest("test Multiple SNP alleles", spec); } @@ -238,12 +238,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "5649f72de04e1391e0f2bb86843d3d72"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "8b26088a035e579c4afd3b46737291e4"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "cb151bb9e90680b12714d481091ed209"); } private void testOutputParameters(final String args, final String md5) { @@ -497,13 +497,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("02175dc9731aed92837ce0db78489fc0")); + Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "fe1af8b30b7f1a267f772b9aaf388f24"); + testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f"); } @Test diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 27fe31fa7..939b9873c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "1a034b7eb572e1b6f659d6e5d57b3e76"); + "d590c8d6d5e58d685401b65a23846893"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("31db0a2d9eb07f86e0a89f0d97169072")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); }