Added allele biased down-sampling capabilities to the PerReadAlleleLikelihoodMap object, which means that both the UG and HC can use this functionality. Note that it's only available in protected, so GATK-lite users won't be allowed to enable it. Needs more testing.

2012-10-24 22:52:25 -04:00 · 2012-10-24 22:52:25 -04:00 · c6b57fffda
parent 9da7bbf689
commit c6b57fffda
12 changed files with 289 additions and 109 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java
@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2010.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.downsampling;
+
+import net.sf.samtools.SAMReadGroupRecord;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.pileup.*;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+
+import java.io.PrintStream;
+import java.util.*;
+
+public class AlleleBiasedDownsamplingUtils {
+
+    private static final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
+    static {
+        for ( int i = 0; i < 4; i++ )
+            alleleStratifiedElements[i] = new ArrayList<PileupElement>();
+    }
+
+    /**
+     * Computes an allele biased version of the given pileup
+     *
+     * @param pileup                    the original pileup
+     * @param downsamplingFraction      the fraction of total reads to remove per allele
+     * @param log                       logging output
+     * @return allele biased pileup
+     */
+    public synchronized static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
+        // special case removal of all or no reads
+        if ( downsamplingFraction <= 0.0 )
+            return pileup;
+        if ( downsamplingFraction >= 1.0 )
+            return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
+
+        // start by stratifying the reads by the alleles they represent at this position
+        for( final PileupElement pe : pileup ) {
+            final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
+            if ( baseIndex != -1 )
+                alleleStratifiedElements[baseIndex].add(pe);
+        }
+
+        // Down-sample *each* allele by the contamination fraction applied to the entire pileup.
+        // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
+        int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
+        final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
+            @Override
+            public int compare(PileupElement element1, PileupElement element2) {
+                final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
+                return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
+            }
+        });
+
+        for ( int i = 0; i < 4; i++ ) {
+            final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
+            if ( alleleList.size() <= numReadsToRemove )
+                logAllElements(alleleList, log);
+            else
+                elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log));
+        }
+
+        // clean up pointers so memory can be garbage collected if needed
+        for ( int i = 0; i < 4; i++ )
+            alleleStratifiedElements[i].clear();
+
+        return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
+    }
+
+    /**
+     * Performs allele biased down-sampling on a pileup and computes the list of elements to keep
+     *
+     * @param elements                  original list of records
+     * @param numElementsToRemove       the number of records to remove
+     * @param log                       logging output
+     * @return the list of pileup elements TO KEEP
+     */
+    private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
+        final int pileupSize = elements.size();
+        final BitSet itemsToRemove = new BitSet(pileupSize);
+        for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
+            itemsToRemove.set(selectedIndex);
+        }
+
+        ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
+        for ( int i = 0; i < pileupSize; i++ ) {
+            if ( itemsToRemove.get(i) )
+                logRead(elements.get(i).getRead(), log);
+            else
+                elementsToKeep.add(elements.get(i));
+        }
+
+        return elementsToKeep;
+    }
+
+    /**
+     * Computes reads to remove based on an allele biased down-sampling
+     *
+     * @param alleleReadMap             original list of records per allele
+     * @param downsamplingFraction      the fraction of total reads to remove per allele
+     * @param log                       logging output
+     * @return list of reads TO REMOVE from allele biased down-sampling
+     */
+    public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction, final PrintStream log) {
+        int totalReads = 0;
+        for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
+            totalReads += reads.size();
+
+        // Down-sample *each* allele by the contamination fraction applied to the entire pileup.
+        int numReadsToRemove = (int)(totalReads * downsamplingFraction);
+        final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove * alleleReadMap.size());
+        for ( final List<GATKSAMRecord> reads : alleleReadMap.values() ) {
+            if ( reads.size() <= numReadsToRemove ) {
+                readsToRemove.addAll(reads);
+                logAllReads(reads, log);
+            } else {
+                readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log));
+            }
+        }
+
+        return readsToRemove;
+    }
+
+    /**
+     * Performs allele biased down-sampling on a pileup and computes the list of elements to remove
+     *
+     * @param reads                     original list of records
+     * @param numElementsToRemove       the number of records to remove
+     * @param log                       logging output
+     * @return the list of pileup elements TO REMOVE
+     */
+    private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
+        final int pileupSize = reads.size();
+        final BitSet itemsToRemove = new BitSet(pileupSize);
+        for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
+            itemsToRemove.set(selectedIndex);
+        }
+
+        ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(pileupSize - numElementsToRemove);
+        for ( int i = 0; i < pileupSize; i++ ) {
+            if ( itemsToRemove.get(i) ) {
+                final GATKSAMRecord read = reads.get(i);
+                readsToRemove.add(read);
+                logRead(read, log);
+            }
+        }
+
+        return readsToRemove;
+    }
+
+    private static void logAllElements(final List<PileupElement> elements, final PrintStream log) {
+        if ( log != null ) {
+            for ( final PileupElement p : elements )
+                logRead(p.getRead(), log);
+        }
+    }
+
+    private static void logAllReads(final List<GATKSAMRecord> reads, final PrintStream log) {
+        if ( log != null ) {
+            for ( final GATKSAMRecord read : reads )
+                logRead(read, log);
+        }
+    }
+
+    private static void logRead(final SAMRecord read, final PrintStream log) {
+        if ( log != null ) {
+            final SAMReadGroupRecord readGroup = read.getReadGroup();
+            log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
+        }
+    }
+}
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
 import org.broadinstitute.sting.utils.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -59,7 +60,7 @@ public class ErrorModel  {

        boolean hasCalledAlleles = false;

-        final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
+        final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
        if (refSampleVC != null) {

            for (Allele allele : refSampleVC.getAlleles()) {
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@ -51,6 +51,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
 import org.broadinstitute.sting.utils.fragments.FragmentCollection;
 import org.broadinstitute.sting.utils.fragments.FragmentUtils;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
 import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
 import org.broadinstitute.sting.utils.pairhmm.PairHMM;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -424,7 +425,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
                  : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
            if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }

-            final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
+            final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
            final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
            final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());

--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;

+import java.io.PrintStream;
 import java.util.*;

 public class LikelihoodCalculationEngine {
@ -346,7 +347,9 @@ public class LikelihoodCalculationEngine {
    public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
                                                                                            final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
                                                                                            final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
-                                                                                            final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
+                                                                                            final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call,
+                                                                                            final double downsamplingFraction,
+                                                                                            final PrintStream downsamplingLog ) {
        final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
        final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
        for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
@ -370,6 +373,9 @@ public class LikelihoodCalculationEngine {
                }
            }

+            // down-sample before adding filtered reads
+            likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
+
            // add all filtered reads to the NO_CALL list because they weren't given any likelihoods
            for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
                // only count the read if it overlaps the event, otherwise it is not added to the output read list at all
--- a/protected/java/src/org/broadinstitute/sting/utils/genotyper/AdvancedPerReadAlleleLikelihoodMap.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/genotyper/AdvancedPerReadAlleleLikelihoodMap.java
@ -25,16 +25,43 @@
 package org.broadinstitute.sting.utils.genotyper;


+import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
 import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.variantcontext.Allele;

+import java.io.PrintStream;
 import java.util.*;

 public class AdvancedPerReadAlleleLikelihoodMap extends StandardPerReadAlleleLikelihoodMap implements ProtectedPackageSource {

-    public PerReadAlleleLikelihoodMap createPerAlleleDownsampledMap(final double downsamplingFraction) {
-        return this;
+    public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
+        return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log);
    }
- }
+
+    public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {
+        // special case removal of all or no reads
+        if ( downsamplingFraction <= 0.0 )
+            return;
+        if ( downsamplingFraction >= 1.0 ) {
+            likelihoodReadMap.clear();
+            return;
+        }
+
+        // start by stratifying the reads by the alleles they represent at this position
+        final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>(alleles.size());
+        for ( Allele allele : alleles )
+            alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
+
+        for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
+            final Allele bestAllele = getMostLikelyAllele(entry.getValue());
+            alleleReadMap.get(bestAllele).add(entry.getKey());
+        }
+
+        // compute the reads to remove and actually remove them
+        final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log);
+        for ( final GATKSAMRecord read : readsToRemove )
+            likelihoodReadMap.remove(read);
+    }
+}
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;

 import java.io.File;
+import java.io.PrintStream;

 /**
 * Created with IntelliJ IDEA.
@ -63,26 +64,23 @@ public class StandardCallerArgumentCollection {
    public int MAX_ALTERNATE_ALLELES = 6;

    /**
-     * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
-     * then only this many alleles will be used.  Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
-     * scales exponentially based on the number of alternate alleles.  Unless there is a good reason to change the default value, we highly recommend
-     * that you not play around with this parameter.
-     *
-     * This argument has been retired in GATK 2.2.  Please specify just maxAltAlleles from now on
+     * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
     */
-    @Deprecated
-    @Hidden
-    @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "This argument has been retired in GATK 2.2.  Please specify just maxAltAlleles from now on, which will apply to any variant, regardless of type", required = false)
-    public int MAX_ALTERNATE_ALLELES_FOR_INDELS = -1;
+    @Advanced
+    @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
+    public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();

    /**
     * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
     * Basically, it will ignore the contamination fraction of reads for each alternate allele.  So if the pileup contains N total bases, then we
     * will try to remove (N * contamination fraction) bases for each alternate allele.
     */
-    @Hidden
    @Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
-    public double CONTAMINATION_PERCENTAGE = 0.0;
+    public double CONTAMINATION_FRACTION = 0.0;
+
+    @Hidden
+    @Argument(shortName = "logRemovedReadsFromContaminationFiltering", doc="contaminationLog", required=false)
+    public PrintStream contaminationLog = null;

    @Hidden
    @Argument(shortName = "logExactCalls", doc="x", required=false)
@ -96,19 +94,12 @@ public class StandardCallerArgumentCollection {
        this.GenotypingMode = SCAC.GenotypingMode;
        this.heterozygosity = SCAC.heterozygosity;
        this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
-        this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
        this.OutputMode = SCAC.OutputMode;
        this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
        this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
-        this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE;
+        this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
+        this.contaminationLog = SCAC.contaminationLog;
        this.exactCallsLog = SCAC.exactCallsLog;
        this.AFmodel = SCAC.AFmodel;
    }
-
-    /**
-     * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
-     */
-    @Advanced
-    @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
-    public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
 }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.Haplotype;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.variantcontext.*;
@ -81,7 +82,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
                                         final List<Allele> allAllelesToUse,
                                         final boolean useBAQedPileup,
                                         final GenomeLocParser locParser,
-                                         final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
+                                         final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {

        GenomeLoc loc = ref.getLocus();
 //        if (!ref.getLocus().equals(lastSiteVisited)) {
@ -118,12 +119,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood

            if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){
                // no likelihoods have been computed for this sample at this site
-                perReadAlleleLikelihoodMap.put(sample.getKey(), org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap());
+                perReadAlleleLikelihoodMap.put(sample.getKey(), PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap());
            }
            final ReadBackedPileup pileup = context.getBasePileup();
            if (pileup != null) {
                final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
-                final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()));
+                final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.CONTAMINATION_FRACTION, UAC.contaminationLog);
                b.PL(genotypeLikelihoods);
                b.DP(getFilteredDepth(pileup));
                genotypes.add(b.make());
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.baq.BAQ;
 import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
@ -48,13 +49,13 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
    private final boolean useAlleleFromVCF;

    private final double[] likelihoodSums = new double[4];
-    private final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
+
+    private final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap;

    protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC, logger);
        useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
-        for ( int i = 0; i < 4; i++ )
-            alleleStratifiedElements[i] = new ArrayList<PileupElement>();
+        perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
    }

    public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
@ -64,9 +65,9 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
                                         final List<Allele> allAllelesToUse,
                                         final boolean useBAQedPileup,
                                         final GenomeLocParser locParser,
-                                         final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
+                                         final Map<String, PerReadAlleleLikelihoodMap> sampleLikelihoodMap) {

-        perReadAlleleLikelihoodMap.clear(); // not used in SNP model, sanity check to delete any older data
+        sampleLikelihoodMap.clear(); // not used in SNP model, sanity check to delete any older data

        final byte refBase = ref.getBase();
        final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
@ -79,8 +80,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
        ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
            ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
-            if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 )
-                pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE);
+            if ( UAC.CONTAMINATION_FRACTION > 0.0 )
+                pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, UAC.CONTAMINATION_FRACTION, UAC.contaminationLog);
            if ( useBAQedPileup )
                pileup = createBAQedPileup(pileup);

@ -203,42 +204,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
        return allelesToUse;
    }

-    public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) {
-        // special case removal of all reads
-        if ( contaminationPercentage >= 1.0 )
-            return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
-
-        // start by stratifying the reads by the alleles they represent at this position
-        for( final PileupElement pe : pileup ) {
-            final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
-            if ( baseIndex != -1 )
-                alleleStratifiedElements[baseIndex].add(pe);
-        }
-
-        // Down-sample *each* allele by the contamination fraction applied to the entire pileup.
-        // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
-        int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage);
-        final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
-            @Override
-            public int compare(PileupElement element1, PileupElement element2) {
-                final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
-                return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
-            }
-        });
-
-        for ( int i = 0; i < 4; i++ ) {
-            final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
-            if ( alleleList.size() > numReadsToRemove )
-                elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove));
-        }
-
-        // clean up pointers so memory can be garbage collected if needed
-        for ( int i = 0; i < 4; i++ )
-            alleleStratifiedElements[i].clear();
-
-        return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
-    }
-
    public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
        final List<PileupElement> BAQedElements = new ArrayList<PileupElement>();
        for( final PileupElement PE : pileup ) {
@ -257,22 +222,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
        public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
    }

-    private List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove) {
-        final int pileupSize = elements.size();
-        final BitSet itemsToRemove = new BitSet(pileupSize);
-        for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
-            itemsToRemove.set(selectedIndex);
-        }
-
-        ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
-        for ( int i = 0; i < pileupSize; i++ ) {
-            if ( !itemsToRemove.get(i) )
-                elementsToKeep.add(elements.get(i));
-        }
-
-        return elementsToKeep;
-    }
-
    private static class SampleGenotypeData {

        public final String name;
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@ -153,19 +153,19 @@ public class UnifiedGenotyperEngine {
    }


-        /**
-        * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
-        *
-        * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype
-        * for every sample in allSamples.  If it's null there's no such guarentee.  Providing this
-        * argument is critical when the resulting calls will be written to a VCF file.
-        *
-        * @param tracker    the meta data tracker
-        * @param refContext the reference base
-        * @param rawContext contextual information around the locus
-        * @param allSamples set of all sample names that we might call (i.e., those in the VCF header)
-        * @return the VariantCallContext object
-        */
+    /**
+     * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
+     *
+     * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype
+     * for every sample in allSamples.  If it's null there's no such guarentee.  Providing this
+     * argument is critical when the resulting calls will be written to a VCF file.
+     *
+     * @param tracker    the meta data tracker
+     * @param refContext the reference base
+     * @param rawContext contextual information around the locus
+     * @param allSamples set of all sample names that we might call (i.e., those in the VCF header)
+     * @return the VariantCallContext object
+     */
    public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
                                                                     final ReferenceContext refContext,
                                                                     final AlignmentContext rawContext,
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@ -41,6 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.sam.ReadUtils;
 import org.broadinstitute.sting.utils.variantcontext.Allele;

+import java.io.PrintStream;
 import java.util.Arrays;
 import java.util.LinkedHashMap;
 import java.util.Map;
@ -187,11 +188,14 @@ public class PairHMMIndelErrorModel {
                                                                        final LinkedHashMap<Allele, Haplotype> haplotypeMap,
                                                                        final ReferenceContext ref,
                                                                        final int eventLength,
-                                                                        final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){
+                                                                        final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
+                                                                        final double downsamplingFraction,
+                                                                        final PrintStream downsamplingLog) {
        final int numHaplotypes = haplotypeMap.size();

        final int readCounts[] = new int[pileup.getNumberOfElements()];
        final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts);
+        perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
        return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
        
    }
--- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java
+++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java
@ -28,9 +28,11 @@ package org.broadinstitute.sting.utils.genotyper;
 import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.variantcontext.Allele;

+import java.io.PrintStream;
 import java.lang.reflect.Constructor;
 import java.util.*;

@ -41,6 +43,9 @@ public abstract class PerReadAlleleLikelihoodMap {
    protected List<Allele> alleles;
    protected Map<GATKSAMRecord,Map<Allele,Double>> likelihoodReadMap;

+    public abstract void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log);
+    public abstract ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log);
+
    public void add(GATKSAMRecord read, Allele a, Double likelihood) {
        Map<Allele,Double> likelihoodMap;
        if (likelihoodReadMap.containsKey(read)){
@ -118,8 +123,6 @@ public abstract class PerReadAlleleLikelihoodMap {
        return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL );
    }

-    public abstract PerReadAlleleLikelihoodMap createPerAlleleDownsampledMap(final double downsamplingFraction);
-
    public static PerReadAlleleLikelihoodMap getBestAvailablePerReadAlleleLikelihoodMap() {
        final Class PerReadAlleleLikelihoodMapClass = GATKLiteUtils.getProtectedClassIfAvailable(PerReadAlleleLikelihoodMap.class);
        try {
--- a/public/java/src/org/broadinstitute/sting/utils/genotyper/StandardPerReadAlleleLikelihoodMap.java
+++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/StandardPerReadAlleleLikelihoodMap.java
@ -26,10 +26,11 @@ package org.broadinstitute.sting.utils.genotyper;


 import org.broadinstitute.sting.utils.classloader.PublicPackageSource;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.variantcontext.Allele;

+import java.io.PrintStream;
 import java.util.*;

 public class StandardPerReadAlleleLikelihoodMap extends PerReadAlleleLikelihoodMap implements PublicPackageSource {
@ -40,5 +41,6 @@ public class StandardPerReadAlleleLikelihoodMap extends PerReadAlleleLikelihoodM
    }

    // not implemented in the standard version
-    public PerReadAlleleLikelihoodMap createPerAlleleDownsampledMap(final double downsamplingFraction) { return this; }
- }
+    public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {}
+    public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { return pileup; }
+}