Optimized the Allele Biased Downsampling: now it doesn't re-sort the pileup but just removes reads from the original one.

Added a small fix that slightly changed md5s.
This commit is contained in:
Eric Banks 2013-01-18 11:17:31 -05:00
parent 08d2da9057
commit cac439bc5e
3 changed files with 52 additions and 90 deletions

View File

@ -47,7 +47,6 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -78,54 +77,46 @@ public class AlleleBiasedDownsamplingUtils {
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
// keep all of the reduced reads
final ArrayList<PileupElement> reducedReadPileups = new ArrayList<PileupElement>();
// start by stratifying the reads by the alleles they represent at this position
for( final PileupElement pe : pileup ) {
for ( final PileupElement pe : pileup ) {
// we do not want to remove a reduced read
if ( pe.getRead().isReducedRead() ) {
reducedReadPileups.add(pe);
} else {
if ( !pe.getRead().isReducedRead() ) {
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
if ( baseIndex != -1 )
alleleStratifiedElements[baseIndex].add(pe);
}
}
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
@Override
public int compare(PileupElement element1, PileupElement element2) {
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
}
});
elementsToKeep.addAll(reducedReadPileups);
// make a listing of allele counts
final int[] alleleCounts = new int[4];
for ( int i = 0; i < 4; i++ )
alleleCounts[i] = alleleStratifiedElements[i].size();
// do smart down-sampling
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
final HashSet<PileupElement> readsToRemove = new HashSet<PileupElement>(numReadsToRemove);
for ( int i = 0; i < 4; i++ ) {
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
// if we don't need to remove any reads, keep them all
if ( alleleList.size() <= targetAlleleCounts[i] )
elementsToKeep.addAll(alleleList);
else
elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
// if we don't need to remove any reads, then don't
if ( alleleList.size() > targetAlleleCounts[i] )
readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
}
// clean up pointers so memory can be garbage collected if needed
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i].clear();
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
// we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise
final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(pileup.getNumberOfElements() - numReadsToRemove);
for ( final PileupElement pe : pileup ) {
if ( !readsToRemove.contains(pe) ) {
readsToKeep.add(pe);
}
}
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(readsToKeep));
}
private static int scoreAlleleCounts(final int[] alleleCounts) {
@ -189,37 +180,43 @@ public class AlleleBiasedDownsamplingUtils {
}
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to keep
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
*
* @param elements original list of records
* @param numElementsToRemove the number of records to remove
* @param log logging output
* @return the list of pileup elements TO KEEP
* @return the list of pileup elements TO REMOVE
*/
private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
if ( numElementsToRemove == 0 )
return elements;
private static <T> List<T> downsampleElements(final List<T> elements, final int numElementsToRemove, final PrintStream log) {
ArrayList<T> elementsToRemove = new ArrayList<T>(numElementsToRemove);
// are there no elements to remove?
if ( numElementsToRemove == 0 )
return elementsToRemove;
// should we remove all of the elements?
final int pileupSize = elements.size();
if ( numElementsToRemove == pileupSize ) {
logAllElements(elements, log);
return new ArrayList<PileupElement>(0);
elementsToRemove.addAll(elements);
return elementsToRemove;
}
// create a bitset describing which elements to remove
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
for ( int i = 0; i < pileupSize; i++ ) {
if ( itemsToRemove.get(i) )
logRead(elements.get(i).getRead(), log);
else
elementsToKeep.add(elements.get(i));
if ( itemsToRemove.get(i) ) {
final T element = elements.get(i);
logElement(element, log);
elementsToRemove.add(element);
}
}
return elementsToKeep;
return elementsToRemove;
}
/**
@ -253,65 +250,30 @@ public class AlleleBiasedDownsamplingUtils {
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
if ( alleleBin.size() > targetAlleleCounts[i] ) {
readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
}
}
return readsToRemove;
}
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
*
* @param reads original list of records
* @param numElementsToRemove the number of records to remove
* @param log logging output
* @return the list of pileup elements TO REMOVE
*/
private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
final ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
if ( numElementsToRemove == 0 )
return readsToRemove;
final int pileupSize = reads.size();
if ( numElementsToRemove == pileupSize ) {
logAllReads(reads, log);
return reads;
}
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
for ( int i = 0; i < pileupSize; i++ ) {
if ( itemsToRemove.get(i) ) {
final GATKSAMRecord read = reads.get(i);
readsToRemove.add(read);
logRead(read, log);
private static <T> void logAllElements(final List<T> elements, final PrintStream log) {
if ( log != null ) {
for ( final T obj : elements ) {
logElement(obj, log);
}
}
return readsToRemove;
}
private static void logAllElements(final List<PileupElement> elements, final PrintStream log) {
private static <T> void logElement(final T obj, final PrintStream log) {
if ( log != null ) {
for ( final PileupElement p : elements )
logRead(p.getRead(), log);
}
}
private static void logAllReads(final List<GATKSAMRecord> reads, final PrintStream log) {
if ( log != null ) {
for ( final GATKSAMRecord read : reads )
logRead(read, log);
}
}
final GATKSAMRecord read;
if ( obj instanceof PileupElement )
read = ((PileupElement)obj).getRead();
else
read = (GATKSAMRecord)obj;
private static void logRead(final SAMRecord read, final PrintStream log) {
if ( log != null ) {
final SAMReadGroupRecord readGroup = read.getReadGroup();
log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
}

View File

@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("b41b95aaa2c453c9b75b3b29a9c2718e"));
Arrays.asList("35479a79e1ce7c15493bd77e58cadcaa"));
executeTest("test Multiple SNP alleles", spec);
}
@ -238,12 +238,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "5649f72de04e1391e0f2bb86843d3d72");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "8b26088a035e579c4afd3b46737291e4");
testOutputParameters("--output_mode EMIT_ALL_SITES", "cb151bb9e90680b12714d481091ed209");
}
private void testOutputParameters(final String args, final String md5) {
@ -497,13 +497,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("02175dc9731aed92837ce0db78489fc0"));
Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304"));
executeTest("test calling on a ReducedRead BAM", spec);
}
@Test
public void testReducedBamSNPs() {
testReducedCalling("SNP", "fe1af8b30b7f1a267f772b9aaf388f24");
testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f");
}
@Test

View File

@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"1a034b7eb572e1b6f659d6e5d57b3e76");
"d590c8d6d5e58d685401b65a23846893");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("31db0a2d9eb07f86e0a89f0d97169072"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}