From ba2c3b57edff4f8b4186ffacd499ba59794b0f7c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 24 Apr 2013 11:52:26 -0400 Subject: [PATCH] Extended the allele-biased down-sampling functionality to handle reduced reads. Note that this works only in the case of pileups (i.e. coming from UG); allele-biased down-sampling for RR just cannot work for haplotypes. Added lots of unit tests for new functionality. --- .../StandardCallerArgumentCollection.java | 6 - ...elGenotypeLikelihoodsCalculationModel.java | 2 +- ...NPGenotypeLikelihoodsCalculationModel.java | 2 +- .../haplotypecaller/GenotypingEngine.java | 10 +- .../indels/PairHMMIndelErrorModel.java | 6 +- ...dGenotyperReducedReadsIntegrationTest.java | 6 +- .../PerReadAlleleLikelihoodMapUnitTest.java | 2 +- .../arguments/GATKArgumentCollection.java | 5 +- .../AlleleBiasedDownsamplingUtils.java | 181 +++++++++++------- .../genotyper/PerReadAlleleLikelihoodMap.java | 19 +- .../sting/utils/pileup/PileupElement.java | 15 +- .../sting/utils/sam/GATKSAMRecord.java | 35 +++- ...AlleleBiasedDownsamplingUtilsUnitTest.java | 56 +++++- .../utils/sam/GATKSAMRecordUnitTest.java | 34 +++- 14 files changed, 268 insertions(+), 111 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 03698489c..63b8717c0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -54,7 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.io.PrintStream; import java.util.Collections; import java.util.Map; @@ -170,10 +169,6 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); - @Hidden - @Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false) - public PrintStream contaminationLog = null; - @Hidden @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; @@ -192,7 +187,6 @@ public class StandardCallerArgumentCollection { this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION; this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE; - this.contaminationLog = SCAC.contaminationLog; this.exactCallsLog = SCAC.exactCallsLog; this.sampleContamination=SCAC.sampleContamination; this.AFmodel = SCAC.AFmodel; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 8a766ba48..c6e9ea379 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -145,7 +145,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) { final GenotypeBuilder b = new GenotypeBuilder(sample.getKey()); - final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()), UAC.contaminationLog); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey())); b.PL(genotypeLikelihoods); b.DP(getFilteredDepth(pileup)); genotypes.add(b.make()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 7d2f794ec..ce5f94478 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -105,7 +105,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); final Double contamination = UAC.getSampleContamination().get(sample.getKey()); if( contamination > 0.0 ) //no need to enter if no contamination reduction - pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup,contamination, UAC.contaminationLog); + pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, contamination); if ( useBAQedPileup ) pileup = createBAQedPileup(pileup); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 5fe98649f..419ea378f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -64,7 +64,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; -import java.io.PrintStream; import java.util.*; public class GenotypingEngine { @@ -196,13 +195,13 @@ public class GenotypingEngine { logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); } - final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); + final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION ); final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL); if( call != null ) { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : - convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) ); + convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) ); final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); VariantContext annotatedCall = call; @@ -406,8 +405,7 @@ public class GenotypingEngine { // BUGBUG: ugh, too complicated protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, final Map> alleleMapper, - final double downsamplingFraction, - final PrintStream downsamplingLog ) { + final double downsamplingFraction ) { final Map alleleReadMap = new LinkedHashMap(); for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample @@ -424,7 +422,7 @@ public class GenotypingEngine { perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); } } - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 5702fff2a..93a015718 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -61,7 +61,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; -import java.io.PrintStream; import java.util.Arrays; import java.util.LinkedHashMap; import java.util.Map; @@ -213,13 +212,12 @@ public class PairHMMIndelErrorModel { final ReferenceContext ref, final int eventLength, final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, - final double downsamplingFraction, - final PrintStream downsamplingLog) { + final double downsamplingFraction) { final int numHaplotypes = haplotypeMap.size(); final int readCounts[] = new int[pileup.getNumberOfElements()]; final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts); - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index 21b7d0f86..c3597b6aa 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8")); + Arrays.asList("f82389f2bc6d3f932a36be65b60af648")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "b424779c6609cb727a675bdd301290e6"); + testReducedCalling("SNP", "c87f89af948a554cc66bc3afa5251c3b"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "38c3d14cb9086f7355788d3db9b8ff16"); + testReducedCalling("INDEL", "ed4ddc42447ec037c1e14757b6cf0515"); } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index 6ca49d3e5..9530ea41f 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -211,7 +211,7 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest { Assert.assertEquals(perReadAlleleLikelihoodMap.size(),pileup.depthOfCoverage()+10); Assert.assertEquals(perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap().get(base_A).size(),60); - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1,null); + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1); Assert.assertEquals(perReadAlleleLikelihoodMap.size(),(int) (0.9*(pileup.depthOfCoverage()+10))); Map> downsampledStrat = perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index e98dcfe9e..8d1fa4638 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -104,8 +104,9 @@ public class GATKArgumentCollection { @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; - @Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.") - public boolean disableRandomization = false; + @Hidden + @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.") + public boolean disableDithering = false; @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false) public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index 26e9febe7..fb7a16bfd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.downsampling; -import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -38,65 +37,62 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.io.File; import java.io.IOException; -import java.io.PrintStream; import java.util.*; import org.apache.log4j.Logger; public class AlleleBiasedDownsamplingUtils { + // define this class so that we can use Java generics below + private final static class PileupElementList extends ArrayList {} + /** * Computes an allele biased version of the given pileup * * @param pileup the original pileup * @param downsamplingFraction the fraction of total reads to remove per allele - * @param log logging output * @return allele biased pileup */ - public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { + public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { // special case removal of all or no reads if ( downsamplingFraction <= 0.0 ) return pileup; if ( downsamplingFraction >= 1.0 ) return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); - final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + final PileupElementList[] alleleStratifiedElements = new PileupElementList[4]; for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i] = new ArrayList(); + alleleStratifiedElements[i] = new PileupElementList(); // start by stratifying the reads by the alleles they represent at this position + boolean sawReducedRead = false; for ( final PileupElement pe : pileup ) { - // we do not want to remove a reduced read - if ( !pe.getRead().isReducedRead() ) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); - } + if ( pe.getRead().isReducedRead() ) + sawReducedRead = true; + + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); } - // make a listing of allele counts - final int[] alleleCounts = new int[4]; - for ( int i = 0; i < 4; i++ ) - alleleCounts[i] = alleleStratifiedElements[i].size(); + // make a listing of allele counts and calculate the total count + final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead); + final int totalAlleleCount = (int)MathUtils.sum(alleleCounts); // do smart down-sampling - int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor + final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); final HashSet readsToRemove = new HashSet(numReadsToRemove); for ( int i = 0; i < 4; i++ ) { - final ArrayList alleleList = alleleStratifiedElements[i]; + final PileupElementList alleleList = alleleStratifiedElements[i]; // if we don't need to remove any reads, then don't - if ( alleleList.size() > targetAlleleCounts[i] ) - readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log)); + if ( alleleCounts[i] > targetAlleleCounts[i] ) + readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i])); } - // clean up pointers so memory can be garbage collected if needed - for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i].clear(); - // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise - final List readsToKeep = new ArrayList(pileup.getNumberOfElements() - numReadsToRemove); + final List readsToKeep = new ArrayList(totalAlleleCount - numReadsToRemove); for ( final PileupElement pe : pileup ) { if ( !readsToRemove.contains(pe) ) { readsToKeep.add(pe); @@ -106,6 +102,26 @@ public class AlleleBiasedDownsamplingUtils { return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(readsToKeep)); } + /** + * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present) + * + * @param alleleStratifiedElements pileup elements stratified by allele + * @param sawReducedRead is at least one read a reduced read? + * @return non-null int array representing allele counts + */ + private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) { + final int[] alleleCounts = new int[alleleStratifiedElements.length]; + for ( int i = 0; i < alleleStratifiedElements.length; i++ ) { + if ( !sawReducedRead ) { + alleleCounts[i] = alleleStratifiedElements[i].size(); + } else { + for ( final PileupElement pe : alleleStratifiedElements[i] ) + alleleCounts[i] += pe.getRepresentativeCount(); + } + } + return alleleCounts; + } + private static int scoreAlleleCounts(final int[] alleleCounts) { if ( alleleCounts.length < 2 ) return 0; @@ -128,11 +144,11 @@ public class AlleleBiasedDownsamplingUtils { } /** - * Computes an allele biased version of the given pileup + * Computes an allele biased version of the allele counts for a given pileup * - * @param alleleCounts the original pileup - * @param numReadsToRemove fraction of total reads to remove per allele - * @return allele biased pileup + * @param alleleCounts the allele counts for the original pileup + * @param numReadsToRemove number of total reads to remove per allele + * @return non-null array of new counts needed per allele */ protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { final int numAlleles = alleleCounts.length; @@ -169,36 +185,50 @@ public class AlleleBiasedDownsamplingUtils { /** * Performs allele biased down-sampling on a pileup and computes the list of elements to remove * - * @param elements original list of records + * @param elements original list of pileup elements + * @param originalElementCount original count of elements (taking reduced reads into account) * @param numElementsToRemove the number of records to remove - * @param log logging output * @return the list of pileup elements TO REMOVE */ - private static List downsampleElements(final List elements, final int numElementsToRemove, final PrintStream log) { - ArrayList elementsToRemove = new ArrayList(numElementsToRemove); - + protected static List downsampleElements(final List elements, final int originalElementCount, final int numElementsToRemove) { // are there no elements to remove? if ( numElementsToRemove == 0 ) - return elementsToRemove; + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); // should we remove all of the elements? - final int pileupSize = elements.size(); - if ( numElementsToRemove == pileupSize ) { - logAllElements(elements, log); + if ( numElementsToRemove >= originalElementCount ) { elementsToRemove.addAll(elements); return elementsToRemove; } // create a bitset describing which elements to remove - final BitSet itemsToRemove = new BitSet(pileupSize); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); } - for ( int i = 0; i < pileupSize; i++ ) { - if ( itemsToRemove.get(i) ) { - final T element = elements.get(i); - logElement(element, log); + int currentBitSetIndex = 0; + for ( final PileupElement element : elements ) { + + final int representativeCount = element.getRepresentativeCount(); + + // if it's a reduced read, we need to be smart about how we down-sample + if ( representativeCount > 1 ) { + // count how many bits are set over the span represented by this read + int setBits = 0; + for ( int i = 0; i < representativeCount; i++ ) + setBits += itemsToRemove.get(currentBitSetIndex++) ? 1 : 0; + + // remove that count from the count of the reduced read + if ( setBits == representativeCount ) + elementsToRemove.add(element); + else + element.adjustRepresentativeCount(-1 * setBits); + } + // otherwise it's trivial: remove if the corresponding bit is set + else if ( itemsToRemove.get(currentBitSetIndex++) ) { elementsToRemove.add(element); } } @@ -211,10 +241,9 @@ public class AlleleBiasedDownsamplingUtils { * * @param alleleReadMap original list of records per allele * @param downsamplingFraction the fraction of total reads to remove per allele - * @param log logging output * @return list of reads TO REMOVE from allele biased down-sampling */ - public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction, final PrintStream log) { + public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { int totalReads = 0; for ( final List reads : alleleReadMap.values() ) totalReads += reads.size(); @@ -225,6 +254,8 @@ public class AlleleBiasedDownsamplingUtils { final List alleles = new ArrayList(alleleReadMap.keySet()); alleles.remove(Allele.NO_CALL); // ignore the no-call bin final int numAlleles = alleles.size(); + + // TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead final int[] alleleCounts = new int[numAlleles]; for ( int i = 0; i < numAlleles; i++ ) alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); @@ -234,38 +265,52 @@ public class AlleleBiasedDownsamplingUtils { final List readsToRemove = new ArrayList(numReadsToRemove); for ( int i = 0; i < numAlleles; i++ ) { - final List alleleBin = alleleReadMap.get(alleles.get(i)); - - if ( alleleBin.size() > targetAlleleCounts[i] ) { - readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log)); + if ( alleleCounts[i] > targetAlleleCounts[i] ) { + readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i])); } } return readsToRemove; } - private static void logAllElements(final List elements, final PrintStream log) { - if ( log != null ) { - for ( final T obj : elements ) { - logElement(obj, log); - } + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove + * + * @param reads original list of records + * @param numElementsToRemove the number of records to remove + * @return the list of pileup elements TO REMOVE + */ + protected static List downsampleElements(final List reads, final int numElementsToRemove) { + // are there no elements to remove? + if ( numElementsToRemove == 0 ) + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); + final int originalElementCount = reads.size(); + + // should we remove all of the elements? + if ( numElementsToRemove >= originalElementCount ) { + elementsToRemove.addAll(reads); + return elementsToRemove; } - } - private static void logElement(final T obj, final PrintStream log) { - if ( log != null ) { - - final GATKSAMRecord read; - if ( obj instanceof PileupElement ) - read = ((PileupElement)obj).getRead(); - else - read = (GATKSAMRecord)obj; - - final SAMReadGroupRecord readGroup = read.getReadGroup(); - log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit())); + // create a bitset describing which elements to remove + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); } - } + int currentBitSetIndex = 0; + for ( final GATKSAMRecord read : reads ) { + if ( read.isReducedRead() ) + throw new IllegalStateException("Allele-biased downsampling of reduced reads has not been implemented for a list of GATKSAMRecords"); + + if ( itemsToRemove.get(currentBitSetIndex++) ) + elementsToRemove.add(read); + } + + return elementsToRemove; + } /** * Create sample-contamination maps from file diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 8134b1257..150e24c51 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.genotyper; import com.google.java.contract.Ensures; -import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; -import java.io.PrintStream; import java.util.*; /** @@ -44,9 +42,8 @@ import java.util.*; * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. */ public class PerReadAlleleLikelihoodMap { - private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class); - protected List alleles; - protected Map> likelihoodReadMap; + protected final List alleles; + protected final Map> likelihoodReadMap; public PerReadAlleleLikelihoodMap() { likelihoodReadMap = new LinkedHashMap>(); @@ -78,17 +75,16 @@ public class PerReadAlleleLikelihoodMap { } - public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { - return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log); + public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { + return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction); } /** * For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion * of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination * @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed. - * @param log - a PrintStream to log the removed reads to (passed through to the utility function) */ - public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) { + public void performPerAlleleDownsampling(final double downsamplingFraction) { // special case removal of all or no reads if ( downsamplingFraction <= 0.0 ) return; @@ -101,7 +97,7 @@ public class PerReadAlleleLikelihoodMap { final Map> alleleReadMap = getAlleleStratifiedReadMap(); // compute the reads to remove and actually remove them - final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log); + final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction); for ( final GATKSAMRecord read : readsToRemove ) likelihoodReadMap.remove(read); } @@ -117,7 +113,8 @@ public class PerReadAlleleLikelihoodMap { alleleReadMap.put(allele, new ArrayList()); for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { - // do not remove reduced reads! + // TODO -- come up with a strategy for down-sampling reduced reads + // Currently we are unable to remove reduced reads because their representative base count differs throughout the read if ( !entry.getKey().isReducedRead() ) { final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); if ( bestAllele.isInformative() ) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 51753ca5e..ba5dee34c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -303,7 +303,7 @@ public class PileupElement implements Comparable { * this being a reduced read and a deletion, we return the average number of elements between the left * and right elements to the deletion. We assume the deletion to be left aligned. * - * @return + * @return the representative count */ public int getRepresentativeCount() { if (read.isReducedRead()) { @@ -318,6 +318,19 @@ public class PileupElement implements Comparable { } } + /** + * Adjusts the representative count of this pileup element. + * Throws an exception if this element does not represent a reduced read. + * + * @param adjustmentFactor how much to adjust the representative count (can be positive or negative) + */ + public void adjustRepresentativeCount(final int adjustmentFactor) { + if ( read.isReducedRead() ) + read.adjustReducedCount(offset, adjustmentFactor); + else + throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced"); + } + /** * Get the cigar element aligning this element to the genome * @return a non-null CigarElement diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 0e672b3d7..e4a2bed44 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -400,7 +400,40 @@ public class GATKSAMRecord extends BAMRecord { * @return the number of bases corresponding to the i'th base of the reduced read */ public final byte getReducedCount(final int i) { - return getReducedReadCounts()[i]; + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced"); + final byte[] reducedCounts = getReducedReadCounts(); + return reducedCounts[i]; + } + + /** + * Sets the number of bases corresponding the i'th base of the reduced read. + * + * @param i the read based coordinate inside the read + * @param count the new count + */ + public final void setReducedCount(final int i, final byte count) { + if ( count < 0 ) + throw new IllegalArgumentException("the reduced count cannot be set to a negative value"); + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); + final byte[] reducedCounts = getReducedReadCounts(); + reducedCounts[i] = count; + setReducedReadCountsTag(reducedCounts); + } + + /** + * Sets the number of bases corresponding the i'th base of the reduced read. + * + * @param i the read based coordinate inside the read + * @param adjustmentFactor how much to add/subtract to the current count + */ + public final void adjustReducedCount(final int i, final int adjustmentFactor) { + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); + final byte[] reducedCounts = getReducedReadCounts(); + final byte newCount = (byte)(reducedCounts[i] + adjustmentFactor); + setReducedCount(i, newCount); } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java index 23b940491..6314d4681 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -25,18 +25,21 @@ package org.broadinstitute.sting.gatk.downsampling; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.*; /** @@ -115,6 +118,51 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { return true; } + @DataProvider(name = "BiasedDownsamplingTest") + public Object[][] makeBiasedDownsamplingTest() { + final List tests = new LinkedList(); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) { + for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) { + for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) { + if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount ) + continue; + for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) { + if ( toRemove <= originalNormalCount + originalReducedCount ) + tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BiasedDownsamplingTest") + public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) { + + final LinkedList elements = new LinkedList(); + for ( int i = 0; i < originalNormalCount; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); + elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); + } + if ( originalReducedCount > 0 ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); + read.setReducedReadCountsTag(new byte[]{(byte)originalReducedCount}); + elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); + } + + final List result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + originalReducedCount, toRemove); + int pileupCount = 0; + for ( final PileupElement pe : elements ) // reduced reads may have gotten modified + pileupCount += pe.getRepresentativeCount(); + for ( final PileupElement pe : result ) + pileupCount -= pe.getRepresentativeCount(); + + Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove); + } @Test public void testLoadContaminationFileDetails(){ diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 57a7946ae..06cdb366e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -39,7 +39,7 @@ public class GATKSAMRecordUnitTest extends BaseTest { final static String BASES = "ACTG"; final static String QUALS = "!+5?"; final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; - final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets + final private static byte[] REDUCED_READ_COUNTS_OFFSETS = new byte[]{10, 10, 20, 30, -9}; // just the offsets @BeforeClass public void init() { @@ -52,7 +52,7 @@ public class GATKSAMRecordUnitTest extends BaseTest { reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); reducedRead.setReadBases(BASES.getBytes()); reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); + reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS_OFFSETS); } @Test @@ -66,6 +66,36 @@ public class GATKSAMRecordUnitTest extends BaseTest { } } + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetReducedCountOnNormalRead() { + read.getReducedCount(0); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testSetReducedTagOnNormalRead() { + read.setReducedCount(0, (byte)2); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testSetReducedCountToNegativeNumber() { + reducedRead.setReducedCount(0, (byte)1000); + } + + @Test + public void testSetReducedTagOnReducedRead() { + for (int i = 0; i < reducedRead.getReadLength(); i++) { + final byte newCount = (byte)i; + reducedRead.setReducedCount(i, newCount); + Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i); + } + + for (int i = 0; i < reducedRead.getReadLength(); i++) { + final int newCount = reducedRead.getReducedCount(i) + i; + reducedRead.adjustReducedCount(i, i); + Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i); + } + } + @Test public void testReducedReadPileupElement() { PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0);