Optimized the Allele Biased Downsampling: now it doesn't re-sort the pileup but just removes reads from the original one.
Added a small fix that slightly changed md5s.
This commit is contained in:
parent
08d2da9057
commit
cac439bc5e
|
|
@ -47,7 +47,6 @@
|
||||||
package org.broadinstitute.sting.gatk.downsampling;
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
import net.sf.samtools.SAMReadGroupRecord;
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
import net.sf.samtools.SAMRecord;
|
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.pileup.*;
|
import org.broadinstitute.sting.utils.pileup.*;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
@ -78,54 +77,46 @@ public class AlleleBiasedDownsamplingUtils {
|
||||||
for ( int i = 0; i < 4; i++ )
|
for ( int i = 0; i < 4; i++ )
|
||||||
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
|
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
|
||||||
|
|
||||||
// keep all of the reduced reads
|
|
||||||
final ArrayList<PileupElement> reducedReadPileups = new ArrayList<PileupElement>();
|
|
||||||
|
|
||||||
// start by stratifying the reads by the alleles they represent at this position
|
// start by stratifying the reads by the alleles they represent at this position
|
||||||
for( final PileupElement pe : pileup ) {
|
for ( final PileupElement pe : pileup ) {
|
||||||
// we do not want to remove a reduced read
|
// we do not want to remove a reduced read
|
||||||
if ( pe.getRead().isReducedRead() ) {
|
if ( !pe.getRead().isReducedRead() ) {
|
||||||
reducedReadPileups.add(pe);
|
|
||||||
} else {
|
|
||||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||||
if ( baseIndex != -1 )
|
if ( baseIndex != -1 )
|
||||||
alleleStratifiedElements[baseIndex].add(pe);
|
alleleStratifiedElements[baseIndex].add(pe);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
|
|
||||||
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
|
|
||||||
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
|
|
||||||
@Override
|
|
||||||
public int compare(PileupElement element1, PileupElement element2) {
|
|
||||||
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
|
|
||||||
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
elementsToKeep.addAll(reducedReadPileups);
|
|
||||||
|
|
||||||
// make a listing of allele counts
|
// make a listing of allele counts
|
||||||
final int[] alleleCounts = new int[4];
|
final int[] alleleCounts = new int[4];
|
||||||
for ( int i = 0; i < 4; i++ )
|
for ( int i = 0; i < 4; i++ )
|
||||||
alleleCounts[i] = alleleStratifiedElements[i].size();
|
alleleCounts[i] = alleleStratifiedElements[i].size();
|
||||||
|
|
||||||
// do smart down-sampling
|
// do smart down-sampling
|
||||||
|
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
|
||||||
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
|
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
|
||||||
|
|
||||||
|
final HashSet<PileupElement> readsToRemove = new HashSet<PileupElement>(numReadsToRemove);
|
||||||
for ( int i = 0; i < 4; i++ ) {
|
for ( int i = 0; i < 4; i++ ) {
|
||||||
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
||||||
// if we don't need to remove any reads, keep them all
|
// if we don't need to remove any reads, then don't
|
||||||
if ( alleleList.size() <= targetAlleleCounts[i] )
|
if ( alleleList.size() > targetAlleleCounts[i] )
|
||||||
elementsToKeep.addAll(alleleList);
|
readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
|
||||||
else
|
|
||||||
elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// clean up pointers so memory can be garbage collected if needed
|
// clean up pointers so memory can be garbage collected if needed
|
||||||
for ( int i = 0; i < 4; i++ )
|
for ( int i = 0; i < 4; i++ )
|
||||||
alleleStratifiedElements[i].clear();
|
alleleStratifiedElements[i].clear();
|
||||||
|
|
||||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
|
// we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise
|
||||||
|
final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(pileup.getNumberOfElements() - numReadsToRemove);
|
||||||
|
for ( final PileupElement pe : pileup ) {
|
||||||
|
if ( !readsToRemove.contains(pe) ) {
|
||||||
|
readsToKeep.add(pe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(readsToKeep));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int scoreAlleleCounts(final int[] alleleCounts) {
|
private static int scoreAlleleCounts(final int[] alleleCounts) {
|
||||||
|
|
@ -189,37 +180,43 @@ public class AlleleBiasedDownsamplingUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to keep
|
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
|
||||||
*
|
*
|
||||||
* @param elements original list of records
|
* @param elements original list of records
|
||||||
* @param numElementsToRemove the number of records to remove
|
* @param numElementsToRemove the number of records to remove
|
||||||
* @param log logging output
|
* @param log logging output
|
||||||
* @return the list of pileup elements TO KEEP
|
* @return the list of pileup elements TO REMOVE
|
||||||
*/
|
*/
|
||||||
private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
|
private static <T> List<T> downsampleElements(final List<T> elements, final int numElementsToRemove, final PrintStream log) {
|
||||||
if ( numElementsToRemove == 0 )
|
ArrayList<T> elementsToRemove = new ArrayList<T>(numElementsToRemove);
|
||||||
return elements;
|
|
||||||
|
|
||||||
|
// are there no elements to remove?
|
||||||
|
if ( numElementsToRemove == 0 )
|
||||||
|
return elementsToRemove;
|
||||||
|
|
||||||
|
// should we remove all of the elements?
|
||||||
final int pileupSize = elements.size();
|
final int pileupSize = elements.size();
|
||||||
if ( numElementsToRemove == pileupSize ) {
|
if ( numElementsToRemove == pileupSize ) {
|
||||||
logAllElements(elements, log);
|
logAllElements(elements, log);
|
||||||
return new ArrayList<PileupElement>(0);
|
elementsToRemove.addAll(elements);
|
||||||
|
return elementsToRemove;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// create a bitset describing which elements to remove
|
||||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||||
itemsToRemove.set(selectedIndex);
|
itemsToRemove.set(selectedIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
|
|
||||||
for ( int i = 0; i < pileupSize; i++ ) {
|
for ( int i = 0; i < pileupSize; i++ ) {
|
||||||
if ( itemsToRemove.get(i) )
|
if ( itemsToRemove.get(i) ) {
|
||||||
logRead(elements.get(i).getRead(), log);
|
final T element = elements.get(i);
|
||||||
else
|
logElement(element, log);
|
||||||
elementsToKeep.add(elements.get(i));
|
elementsToRemove.add(element);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return elementsToKeep;
|
return elementsToRemove;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -253,65 +250,30 @@ public class AlleleBiasedDownsamplingUtils {
|
||||||
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
|
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
|
||||||
|
|
||||||
if ( alleleBin.size() > targetAlleleCounts[i] ) {
|
if ( alleleBin.size() > targetAlleleCounts[i] ) {
|
||||||
readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
|
readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return readsToRemove;
|
return readsToRemove;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
private static <T> void logAllElements(final List<T> elements, final PrintStream log) {
|
||||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
|
if ( log != null ) {
|
||||||
*
|
for ( final T obj : elements ) {
|
||||||
* @param reads original list of records
|
logElement(obj, log);
|
||||||
* @param numElementsToRemove the number of records to remove
|
|
||||||
* @param log logging output
|
|
||||||
* @return the list of pileup elements TO REMOVE
|
|
||||||
*/
|
|
||||||
private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
|
|
||||||
final ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
|
|
||||||
|
|
||||||
if ( numElementsToRemove == 0 )
|
|
||||||
return readsToRemove;
|
|
||||||
|
|
||||||
final int pileupSize = reads.size();
|
|
||||||
if ( numElementsToRemove == pileupSize ) {
|
|
||||||
logAllReads(reads, log);
|
|
||||||
return reads;
|
|
||||||
}
|
|
||||||
|
|
||||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
|
||||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
|
||||||
itemsToRemove.set(selectedIndex);
|
|
||||||
}
|
|
||||||
|
|
||||||
for ( int i = 0; i < pileupSize; i++ ) {
|
|
||||||
if ( itemsToRemove.get(i) ) {
|
|
||||||
final GATKSAMRecord read = reads.get(i);
|
|
||||||
readsToRemove.add(read);
|
|
||||||
logRead(read, log);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return readsToRemove;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void logAllElements(final List<PileupElement> elements, final PrintStream log) {
|
private static <T> void logElement(final T obj, final PrintStream log) {
|
||||||
if ( log != null ) {
|
if ( log != null ) {
|
||||||
for ( final PileupElement p : elements )
|
|
||||||
logRead(p.getRead(), log);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void logAllReads(final List<GATKSAMRecord> reads, final PrintStream log) {
|
final GATKSAMRecord read;
|
||||||
if ( log != null ) {
|
if ( obj instanceof PileupElement )
|
||||||
for ( final GATKSAMRecord read : reads )
|
read = ((PileupElement)obj).getRead();
|
||||||
logRead(read, log);
|
else
|
||||||
}
|
read = (GATKSAMRecord)obj;
|
||||||
}
|
|
||||||
|
|
||||||
private static void logRead(final SAMRecord read, final PrintStream log) {
|
|
||||||
if ( log != null ) {
|
|
||||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||||
log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
|
log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testMultipleSNPAlleles() {
|
public void testMultipleSNPAlleles() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||||
Arrays.asList("b41b95aaa2c453c9b75b3b29a9c2718e"));
|
Arrays.asList("35479a79e1ce7c15493bd77e58cadcaa"));
|
||||||
executeTest("test Multiple SNP alleles", spec);
|
executeTest("test Multiple SNP alleles", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -238,12 +238,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testOutputParameterAllConfident() {
|
public void testOutputParameterAllConfident() {
|
||||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f");
|
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "5649f72de04e1391e0f2bb86843d3d72");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testOutputParameterAllSites() {
|
public void testOutputParameterAllSites() {
|
||||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "8b26088a035e579c4afd3b46737291e4");
|
testOutputParameters("--output_mode EMIT_ALL_SITES", "cb151bb9e90680b12714d481091ed209");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void testOutputParameters(final String args, final String md5) {
|
private void testOutputParameters(final String args, final String md5) {
|
||||||
|
|
@ -497,13 +497,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testReducedBam() {
|
public void testReducedBam() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||||
Arrays.asList("02175dc9731aed92837ce0db78489fc0"));
|
Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304"));
|
||||||
executeTest("test calling on a ReducedRead BAM", spec);
|
executeTest("test calling on a ReducedRead BAM", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testReducedBamSNPs() {
|
public void testReducedBamSNPs() {
|
||||||
testReducedCalling("SNP", "fe1af8b30b7f1a267f772b9aaf388f24");
|
testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
||||||
|
|
@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||||
"1a034b7eb572e1b6f659d6e5d57b3e76");
|
"d590c8d6d5e58d685401b65a23846893");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||||
|
|
@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7"));
|
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("31db0a2d9eb07f86e0a89f0d97169072"));
|
||||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue