Optimized the Allele Biased Downsampling: now it doesn't re-sort the pileup but just removes reads from the original one.
Added a small fix that slightly changed md5s.
This commit is contained in:
parent
08d2da9057
commit
cac439bc5e
|
|
@ -47,7 +47,6 @@
|
|||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -78,54 +77,46 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
|
||||
|
||||
// keep all of the reduced reads
|
||||
final ArrayList<PileupElement> reducedReadPileups = new ArrayList<PileupElement>();
|
||||
|
||||
// start by stratifying the reads by the alleles they represent at this position
|
||||
for( final PileupElement pe : pileup ) {
|
||||
for ( final PileupElement pe : pileup ) {
|
||||
// we do not want to remove a reduced read
|
||||
if ( pe.getRead().isReducedRead() ) {
|
||||
reducedReadPileups.add(pe);
|
||||
} else {
|
||||
if ( !pe.getRead().isReducedRead() ) {
|
||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||
if ( baseIndex != -1 )
|
||||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
}
|
||||
|
||||
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
|
||||
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
|
||||
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
|
||||
@Override
|
||||
public int compare(PileupElement element1, PileupElement element2) {
|
||||
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
|
||||
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
|
||||
}
|
||||
});
|
||||
elementsToKeep.addAll(reducedReadPileups);
|
||||
|
||||
// make a listing of allele counts
|
||||
final int[] alleleCounts = new int[4];
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleCounts[i] = alleleStratifiedElements[i].size();
|
||||
|
||||
// do smart down-sampling
|
||||
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
|
||||
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
|
||||
|
||||
final HashSet<PileupElement> readsToRemove = new HashSet<PileupElement>(numReadsToRemove);
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
||||
// if we don't need to remove any reads, keep them all
|
||||
if ( alleleList.size() <= targetAlleleCounts[i] )
|
||||
elementsToKeep.addAll(alleleList);
|
||||
else
|
||||
elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
|
||||
// if we don't need to remove any reads, then don't
|
||||
if ( alleleList.size() > targetAlleleCounts[i] )
|
||||
readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
|
||||
}
|
||||
|
||||
// clean up pointers so memory can be garbage collected if needed
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i].clear();
|
||||
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
|
||||
// we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise
|
||||
final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(pileup.getNumberOfElements() - numReadsToRemove);
|
||||
for ( final PileupElement pe : pileup ) {
|
||||
if ( !readsToRemove.contains(pe) ) {
|
||||
readsToKeep.add(pe);
|
||||
}
|
||||
}
|
||||
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(readsToKeep));
|
||||
}
|
||||
|
||||
private static int scoreAlleleCounts(final int[] alleleCounts) {
|
||||
|
|
@ -189,37 +180,43 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to keep
|
||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
|
||||
*
|
||||
* @param elements original list of records
|
||||
* @param numElementsToRemove the number of records to remove
|
||||
* @param log logging output
|
||||
* @return the list of pileup elements TO KEEP
|
||||
* @return the list of pileup elements TO REMOVE
|
||||
*/
|
||||
private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
|
||||
if ( numElementsToRemove == 0 )
|
||||
return elements;
|
||||
private static <T> List<T> downsampleElements(final List<T> elements, final int numElementsToRemove, final PrintStream log) {
|
||||
ArrayList<T> elementsToRemove = new ArrayList<T>(numElementsToRemove);
|
||||
|
||||
// are there no elements to remove?
|
||||
if ( numElementsToRemove == 0 )
|
||||
return elementsToRemove;
|
||||
|
||||
// should we remove all of the elements?
|
||||
final int pileupSize = elements.size();
|
||||
if ( numElementsToRemove == pileupSize ) {
|
||||
logAllElements(elements, log);
|
||||
return new ArrayList<PileupElement>(0);
|
||||
elementsToRemove.addAll(elements);
|
||||
return elementsToRemove;
|
||||
}
|
||||
|
||||
// create a bitset describing which elements to remove
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( itemsToRemove.get(i) )
|
||||
logRead(elements.get(i).getRead(), log);
|
||||
else
|
||||
elementsToKeep.add(elements.get(i));
|
||||
if ( itemsToRemove.get(i) ) {
|
||||
final T element = elements.get(i);
|
||||
logElement(element, log);
|
||||
elementsToRemove.add(element);
|
||||
}
|
||||
}
|
||||
|
||||
return elementsToKeep;
|
||||
return elementsToRemove;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -253,65 +250,30 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
|
||||
|
||||
if ( alleleBin.size() > targetAlleleCounts[i] ) {
|
||||
readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
|
||||
readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
|
||||
}
|
||||
}
|
||||
|
||||
return readsToRemove;
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
|
||||
*
|
||||
* @param reads original list of records
|
||||
* @param numElementsToRemove the number of records to remove
|
||||
* @param log logging output
|
||||
* @return the list of pileup elements TO REMOVE
|
||||
*/
|
||||
private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
|
||||
final ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
|
||||
|
||||
if ( numElementsToRemove == 0 )
|
||||
return readsToRemove;
|
||||
|
||||
final int pileupSize = reads.size();
|
||||
if ( numElementsToRemove == pileupSize ) {
|
||||
logAllReads(reads, log);
|
||||
return reads;
|
||||
}
|
||||
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( itemsToRemove.get(i) ) {
|
||||
final GATKSAMRecord read = reads.get(i);
|
||||
readsToRemove.add(read);
|
||||
logRead(read, log);
|
||||
private static <T> void logAllElements(final List<T> elements, final PrintStream log) {
|
||||
if ( log != null ) {
|
||||
for ( final T obj : elements ) {
|
||||
logElement(obj, log);
|
||||
}
|
||||
}
|
||||
|
||||
return readsToRemove;
|
||||
}
|
||||
|
||||
private static void logAllElements(final List<PileupElement> elements, final PrintStream log) {
|
||||
private static <T> void logElement(final T obj, final PrintStream log) {
|
||||
if ( log != null ) {
|
||||
for ( final PileupElement p : elements )
|
||||
logRead(p.getRead(), log);
|
||||
}
|
||||
}
|
||||
|
||||
private static void logAllReads(final List<GATKSAMRecord> reads, final PrintStream log) {
|
||||
if ( log != null ) {
|
||||
for ( final GATKSAMRecord read : reads )
|
||||
logRead(read, log);
|
||||
}
|
||||
}
|
||||
final GATKSAMRecord read;
|
||||
if ( obj instanceof PileupElement )
|
||||
read = ((PileupElement)obj).getRead();
|
||||
else
|
||||
read = (GATKSAMRecord)obj;
|
||||
|
||||
private static void logRead(final SAMRecord read, final PrintStream log) {
|
||||
if ( log != null ) {
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("b41b95aaa2c453c9b75b3b29a9c2718e"));
|
||||
Arrays.asList("35479a79e1ce7c15493bd77e58cadcaa"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -238,12 +238,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testOutputParameterAllConfident() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f");
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "5649f72de04e1391e0f2bb86843d3d72");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterAllSites() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "8b26088a035e579c4afd3b46737291e4");
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "cb151bb9e90680b12714d481091ed209");
|
||||
}
|
||||
|
||||
private void testOutputParameters(final String args, final String md5) {
|
||||
|
|
@ -497,13 +497,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("02175dc9731aed92837ce0db78489fc0"));
|
||||
Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "fe1af8b30b7f1a267f772b9aaf388f24");
|
||||
testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
|
|
@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"1a034b7eb572e1b6f659d6e5d57b3e76");
|
||||
"d590c8d6d5e58d685401b65a23846893");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
|
|
@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("31db0a2d9eb07f86e0a89f0d97169072"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue