diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index 59357e1c4..1a7b4da51 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -67,7 +67,6 @@ public class AlleleBiasedDownsamplingUtils { alleleStratifiedElements[baseIndex].add(pe); } - // Down-sample *each* allele by the contamination fraction applied to the entire pileup. // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor final TreeSet elementsToKeep = new TreeSet(new Comparator() { @@ -78,12 +77,21 @@ public class AlleleBiasedDownsamplingUtils { } }); + // make a listing of allele counts + final int[] alleleCounts = new int[4]; + for ( int i = 0; i < 4; i++ ) + alleleCounts[i] = alleleStratifiedElements[i].size(); + + // do smart down-sampling + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + for ( int i = 0; i < 4; i++ ) { final ArrayList alleleList = alleleStratifiedElements[i]; - if ( alleleList.size() <= numReadsToRemove ) - logAllElements(alleleList, log); + // if we don't need to remove any reads, keep them all + if ( alleleList.size() <= targetAlleleCounts[i] ) + elementsToKeep.addAll(alleleList); else - elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log)); + elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log)); } // clean up pointers so memory can be garbage collected if needed @@ -93,6 +101,59 @@ public class AlleleBiasedDownsamplingUtils { return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); } + private static int scoreAlleleCounts(final int[] alleleCounts) { + final int maxIndex = MathUtils.maxElementIndex(alleleCounts); + final int maxCount = alleleCounts[maxIndex]; + + int nonMaxCount = 0; + for ( int i = 0; i < 4; i++ ) { + if ( i != maxIndex ) + nonMaxCount += alleleCounts[i]; + } + + // try to get the best score: in the het case the counts should be equal and in the hom case the non-max should be zero + return Math.min(Math.abs(maxCount - nonMaxCount), Math.abs(nonMaxCount)); + } + + /** + * Computes an allele biased version of the given pileup + * + * @param alleleCounts the original pileup + * @param numReadsToRemove fraction of total reads to remove per allele + * @return allele biased pileup + */ + protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { + final int numAlleles = alleleCounts.length; + + int maxScore = scoreAlleleCounts(alleleCounts); + int[] alleleCountsOfMax = alleleCounts; + + final int numReadsToRemovePerAllele = numReadsToRemove / 2; + + for ( int i = 0; i < numAlleles; i++ ) { + for ( int j = i; j < numAlleles; j++ ) { + final int[] newCounts = alleleCounts.clone(); + + // split these cases so we don't lose on the floor (since we divided by 2) + if ( i == j ) { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove); + } else { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele); + newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele); + } + + final int score = scoreAlleleCounts(newCounts); + + if ( score < maxScore ) { + maxScore = score; + alleleCountsOfMax = newCounts; + } + } + } + + return alleleCountsOfMax; + } + /** * Performs allele biased down-sampling on a pileup and computes the list of elements to keep * @@ -102,7 +163,15 @@ public class AlleleBiasedDownsamplingUtils { * @return the list of pileup elements TO KEEP */ private static List downsampleElements(final ArrayList elements, final int numElementsToRemove, final PrintStream log) { + if ( numElementsToRemove == 0 ) + return elements; + final int pileupSize = elements.size(); + if ( numElementsToRemove == pileupSize ) { + logAllElements(elements, log); + return new ArrayList(0); + } + final BitSet itemsToRemove = new BitSet(pileupSize); for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); @@ -132,15 +201,25 @@ public class AlleleBiasedDownsamplingUtils { for ( final List reads : alleleReadMap.values() ) totalReads += reads.size(); - // Down-sample *each* allele by the contamination fraction applied to the entire pileup. int numReadsToRemove = (int)(totalReads * downsamplingFraction); - final List readsToRemove = new ArrayList(numReadsToRemove * alleleReadMap.size()); - for ( final List reads : alleleReadMap.values() ) { - if ( reads.size() <= numReadsToRemove ) { - readsToRemove.addAll(reads); - logAllReads(reads, log); - } else { - readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log)); + + // make a listing of allele counts + final List alleles = new ArrayList(alleleReadMap.keySet()); + alleles.remove(Allele.NO_CALL); // ignore the no-call bin + final int numAlleles = alleles.size(); + final int[] alleleCounts = new int[numAlleles]; + for ( int i = 0; i < numAlleles; i++ ) + alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); + + // do smart down-sampling + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + + final List readsToRemove = new ArrayList(numReadsToRemove); + for ( int i = 0; i < numAlleles; i++ ) { + final List alleleBin = alleleReadMap.get(alleles.get(i)); + + if ( alleleBin.size() > targetAlleleCounts[i] ) { + readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log)); } } @@ -156,13 +235,22 @@ public class AlleleBiasedDownsamplingUtils { * @return the list of pileup elements TO REMOVE */ private static List downsampleReads(final List reads, final int numElementsToRemove, final PrintStream log) { + final ArrayList readsToRemove = new ArrayList(numElementsToRemove); + + if ( numElementsToRemove == 0 ) + return readsToRemove; + final int pileupSize = reads.size(); + if ( numElementsToRemove == pileupSize ) { + logAllReads(reads, log); + return reads; + } + final BitSet itemsToRemove = new BitSet(pileupSize); for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); } - ArrayList readsToRemove = new ArrayList(pileupSize - numElementsToRemove); for ( int i = 0; i < pileupSize; i++ ) { if ( itemsToRemove.get(i) ) { final GATKSAMRecord read = reads.get(i); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java new file mode 100755 index 000000000..7c2f5619a --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + + +/** + * Basic unit test for AlleleBiasedDownsamplingUtils + */ +public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { + + + @Test + public void testSmartDownsampling() { + + final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; + final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; + + // no contamination, no removal + testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // het sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // hom sample, het contaminant, overlapping alleles + final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; + testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + + // hom sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); + + // het sample, het contaminant, overlapping alleles + testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); + testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + } + + private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, + final int pileupSize, final int[] initialCounts, final int[] targetCounts) { + + final int[] actualCounts = initialCounts.clone(); + actualCounts[0] += addA; + actualCounts[1] += addC; + actualCounts[2] += addG; + actualCounts[3] += addT; + + final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); + Assert.assertTrue(countsAreEqual(actualCounts, targetCounts)); + } + + private static boolean countsAreEqual(final int[] counts1, final int[] counts2) { + for ( int i = 0; i < 4; i++ ) { + if ( counts1[i] != counts2[i] ) + return false; + } + return true; + } +}