Extended the allele-biased down-sampling functionality to handle reduced reads.

Note that this works only in the case of pileups (i.e. coming from UG); allele-biased down-sampling for RR just cannot work for haplotypes. Added lots of unit tests for new functionality.
2013-04-24 11:52:26 -04:00 · 2013-04-24 11:52:26 -04:00 · ba2c3b57ed
parent b749f06ba6
commit ba2c3b57ed
14 changed files with 268 additions and 111 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@ -54,7 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
 import org.broadinstitute.variant.variantcontext.VariantContext;

 import java.io.File;
-import java.io.PrintStream;
 import java.util.Collections;
 import java.util.Map;

@ -170,10 +169,6 @@ public class StandardCallerArgumentCollection {
    @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
    public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();

-    @Hidden
-    @Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false)
-    public PrintStream contaminationLog = null;
-
    @Hidden
    @Argument(shortName = "logExactCalls", doc="x", required=false)
    public File exactCallsLog = null;
@ -192,7 +187,6 @@ public class StandardCallerArgumentCollection {
        this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
        this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
        this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE;
-        this.contaminationLog = SCAC.contaminationLog;
        this.exactCallsLog = SCAC.exactCallsLog;
        this.sampleContamination=SCAC.sampleContamination;
        this.AFmodel = SCAC.AFmodel;
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@ -145,7 +145,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
            final ReadBackedPileup pileup = context.getBasePileup();
            if (pileup != null) {
                final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
-                final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()), UAC.contaminationLog);
+                final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()));
                b.PL(genotypeLikelihoods);
                b.DP(getFilteredDepth(pileup));
                genotypes.add(b.make());
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
@ -105,7 +105,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
            ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
            final Double contamination =  UAC.getSampleContamination().get(sample.getKey());
            if( contamination > 0.0 ) //no need to enter if no contamination reduction
-                pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup,contamination, UAC.contaminationLog);
+                pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, contamination);
            if ( useBAQedPileup )
                pileup = createBAQedPileup(pileup);

--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
@ -64,7 +64,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
 import org.broadinstitute.variant.variantcontext.*;

-import java.io.PrintStream;
 import java.util.*;

 public class GenotypingEngine {
@ -196,13 +195,13 @@ public class GenotypingEngine {
                    logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
                }

-                final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
+                final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION );

                final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC );
                final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL);
                if( call != null ) {
                    final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap :
-                            convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) );
+                            convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
                    final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );

                    VariantContext annotatedCall = call;
@ -406,8 +405,7 @@ public class GenotypingEngine {
    // BUGBUG: ugh, too complicated
    protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
                                                                                              final Map<Allele, List<Haplotype>> alleleMapper,
-                                                                                              final double downsamplingFraction,
-                                                                                              final PrintStream downsamplingLog ) {
+                                                                                              final double downsamplingFraction ) {

        final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
        for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
@ -424,7 +422,7 @@ public class GenotypingEngine {
                    perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
                }
            }
-            perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling
+            perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling
            alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
        }

--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@ -61,7 +61,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.sam.ReadUtils;
 import org.broadinstitute.variant.variantcontext.Allele;

-import java.io.PrintStream;
 import java.util.Arrays;
 import java.util.LinkedHashMap;
 import java.util.Map;
@ -213,13 +212,12 @@ public class PairHMMIndelErrorModel {
                                                                        final ReferenceContext ref,
                                                                        final int eventLength,
                                                                        final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
-                                                                        final double downsamplingFraction,
-                                                                        final PrintStream downsamplingLog) {
+                                                                        final double downsamplingFraction) {
        final int numHaplotypes = haplotypeMap.size();

        final int readCounts[] = new int[pileup.getNumberOfElements()];
        final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts);
-        perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
+        perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction);
        return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
        
    }
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java
@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
    public void testReducedBam() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
-                Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8"));
+                Arrays.asList("f82389f2bc6d3f932a36be65b60af648"));
        executeTest("test calling on a ReducedRead BAM", spec);
    }

    @Test
    public void testReducedBamSNPs() {
-        testReducedCalling("SNP", "b424779c6609cb727a675bdd301290e6");
+        testReducedCalling("SNP", "c87f89af948a554cc66bc3afa5251c3b");
    }

    @Test
    public void testReducedBamINDELs() {
-        testReducedCalling("INDEL", "38c3d14cb9086f7355788d3db9b8ff16");
+        testReducedCalling("INDEL", "ed4ddc42447ec037c1e14757b6cf0515");
    }


--- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java
@ -211,7 +211,7 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest {

        Assert.assertEquals(perReadAlleleLikelihoodMap.size(),pileup.depthOfCoverage()+10);
        Assert.assertEquals(perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap().get(base_A).size(),60);
-        perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1,null);
+        perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1);
        Assert.assertEquals(perReadAlleleLikelihoodMap.size(),(int) (0.9*(pileup.depthOfCoverage()+10)));

        Map<Allele,List<GATKSAMRecord>> downsampledStrat = perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap();
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@ -104,8 +104,9 @@ public class GATKArgumentCollection {
    @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
    public boolean nonDeterministicRandomSeed = false;

-    @Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
-    public boolean disableRandomization = false;
+    @Hidden
+    @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.")
+    public boolean disableDithering = false;

    @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure.  By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false)
    public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT;
--- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java
@ -25,7 +25,6 @@

 package org.broadinstitute.sting.gatk.downsampling;

-import net.sf.samtools.SAMReadGroupRecord;
 import org.broadinstitute.sting.utils.*;
 import org.broadinstitute.sting.utils.collections.DefaultHashMap;
 import org.broadinstitute.sting.utils.exceptions.StingException;
@ -38,65 +37,62 @@ import org.broadinstitute.variant.variantcontext.Allele;

 import java.io.File;
 import java.io.IOException;
-import java.io.PrintStream;
 import java.util.*;

 import org.apache.log4j.Logger;

 public class AlleleBiasedDownsamplingUtils {

+    // define this class so that we can use Java generics below
+    private final static class PileupElementList extends ArrayList<PileupElement> {}
+
    /**
     * Computes an allele biased version of the given pileup
     *
     * @param pileup                    the original pileup
     * @param downsamplingFraction      the fraction of total reads to remove per allele
-     * @param log                       logging output
     * @return allele biased pileup
     */
-    public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
+    public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
        // special case removal of all or no reads
        if ( downsamplingFraction <= 0.0 )
            return pileup;
        if ( downsamplingFraction >= 1.0 )
            return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());

-        final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
+        final PileupElementList[] alleleStratifiedElements = new PileupElementList[4];
        for ( int i = 0; i < 4; i++ )
-            alleleStratifiedElements[i] = new ArrayList<PileupElement>();
+            alleleStratifiedElements[i] = new PileupElementList();

        // start by stratifying the reads by the alleles they represent at this position
+        boolean sawReducedRead = false;
        for ( final PileupElement pe : pileup ) {
-            // we do not want to remove a reduced read
-            if ( !pe.getRead().isReducedRead() ) {
-                final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
-                if ( baseIndex != -1 )
-                    alleleStratifiedElements[baseIndex].add(pe);
-            }
+            if ( pe.getRead().isReducedRead() )
+                sawReducedRead = true;
+
+            final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
+            if ( baseIndex != -1 )
+                alleleStratifiedElements[baseIndex].add(pe);
        }

-        // make a listing of allele counts
-        final int[] alleleCounts = new int[4];
-        for ( int i = 0; i < 4; i++ )
-            alleleCounts[i] = alleleStratifiedElements[i].size();
+        // make a listing of allele counts and calculate the total count
+        final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead);
+        final int totalAlleleCount = (int)MathUtils.sum(alleleCounts);

        // do smart down-sampling
-        int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
+        final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor
        final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);

        final HashSet<PileupElement> readsToRemove = new HashSet<PileupElement>(numReadsToRemove);
        for ( int i = 0; i < 4; i++ ) {
-            final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
+            final PileupElementList alleleList = alleleStratifiedElements[i];
            // if we don't need to remove any reads, then don't
-            if ( alleleList.size() > targetAlleleCounts[i] )
-                readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
+            if ( alleleCounts[i] > targetAlleleCounts[i] )
+                readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i]));
        }

-        // clean up pointers so memory can be garbage collected if needed
-        for ( int i = 0; i < 4; i++ )
-            alleleStratifiedElements[i].clear();
-
        // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise
-        final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(pileup.getNumberOfElements() - numReadsToRemove);
+        final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(totalAlleleCount - numReadsToRemove);
        for ( final PileupElement pe : pileup ) {
            if ( !readsToRemove.contains(pe) ) {
                readsToKeep.add(pe);
@ -106,6 +102,26 @@ public class AlleleBiasedDownsamplingUtils {
        return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(readsToKeep));
    }

+    /**
+     * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present)
+     *
+     * @param alleleStratifiedElements       pileup elements stratified by allele
+     * @param sawReducedRead                 is at least one read a reduced read?
+     * @return non-null int array representing allele counts
+     */
+    private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) {
+        final int[] alleleCounts = new int[alleleStratifiedElements.length];
+        for ( int i = 0; i < alleleStratifiedElements.length; i++ ) {
+            if ( !sawReducedRead ) {
+                alleleCounts[i] = alleleStratifiedElements[i].size();
+            } else {
+                for ( final PileupElement pe : alleleStratifiedElements[i] )
+                    alleleCounts[i] += pe.getRepresentativeCount();
+            }
+        }
+        return alleleCounts;
+    }
+
    private static int scoreAlleleCounts(final int[] alleleCounts) {
        if ( alleleCounts.length < 2 )
            return 0;
@ -128,11 +144,11 @@ public class AlleleBiasedDownsamplingUtils {
    }

    /**
-     * Computes an allele biased version of the given pileup
+     * Computes an allele biased version of the allele counts for a given pileup
     *
-     * @param alleleCounts              the original pileup
-     * @param numReadsToRemove          fraction of total reads to remove per allele
-     * @return allele biased pileup
+     * @param alleleCounts              the allele counts for the original pileup
+     * @param numReadsToRemove          number of total reads to remove per allele
+     * @return non-null array of new counts needed per allele
     */
    protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) {
        final int numAlleles = alleleCounts.length;
@ -169,36 +185,50 @@ public class AlleleBiasedDownsamplingUtils {
    /**
     * Performs allele biased down-sampling on a pileup and computes the list of elements to remove
     *
-     * @param elements                  original list of records
+     * @param elements                  original list of pileup elements
+     * @param originalElementCount      original count of elements (taking reduced reads into account)
     * @param numElementsToRemove       the number of records to remove
-     * @param log                       logging output
     * @return the list of pileup elements TO REMOVE
     */
-    private static <T> List<T> downsampleElements(final List<T> elements, final int numElementsToRemove, final PrintStream log) {
-        ArrayList<T> elementsToRemove = new ArrayList<T>(numElementsToRemove);
-
+    protected static List<PileupElement> downsampleElements(final List<PileupElement> elements, final int originalElementCount, final int numElementsToRemove) {
        // are there no elements to remove?
        if ( numElementsToRemove == 0 )
-            return elementsToRemove;
+            return Collections.<PileupElement>emptyList();
+
+        final ArrayList<PileupElement> elementsToRemove = new ArrayList<PileupElement>(numElementsToRemove);

        // should we remove all of the elements?
-        final int pileupSize = elements.size();
-        if ( numElementsToRemove == pileupSize ) {
-            logAllElements(elements, log);
+        if ( numElementsToRemove >= originalElementCount ) {
            elementsToRemove.addAll(elements);
            return elementsToRemove;
        }

        // create a bitset describing which elements to remove
-        final BitSet itemsToRemove = new BitSet(pileupSize);
-        for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
+        final BitSet itemsToRemove = new BitSet(originalElementCount);
+        for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) {
            itemsToRemove.set(selectedIndex);
        }

-        for ( int i = 0; i < pileupSize; i++ ) {
-            if ( itemsToRemove.get(i) ) {
-                final T element = elements.get(i);
-                logElement(element, log);
+        int currentBitSetIndex = 0;
+        for ( final PileupElement element : elements ) {
+
+            final int representativeCount = element.getRepresentativeCount();
+
+            // if it's a reduced read, we need to be smart about how we down-sample
+            if ( representativeCount > 1 ) {
+                // count how many bits are set over the span represented by this read
+                int setBits = 0;
+                for ( int i = 0; i < representativeCount; i++ )
+                    setBits += itemsToRemove.get(currentBitSetIndex++) ? 1 : 0;
+
+                // remove that count from the count of the reduced read
+                if ( setBits == representativeCount )
+                    elementsToRemove.add(element);
+                else
+                    element.adjustRepresentativeCount(-1 * setBits);
+            }
+            // otherwise it's trivial: remove if the corresponding bit is set
+            else if ( itemsToRemove.get(currentBitSetIndex++) ) {
                elementsToRemove.add(element);
            }
        }
@ -211,10 +241,9 @@ public class AlleleBiasedDownsamplingUtils {
     *
     * @param alleleReadMap             original list of records per allele
     * @param downsamplingFraction      the fraction of total reads to remove per allele
-     * @param log                       logging output
     * @return list of reads TO REMOVE from allele biased down-sampling
     */
-    public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction, final PrintStream log) {
+    public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction) {
        int totalReads = 0;
        for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
            totalReads += reads.size();
@ -225,6 +254,8 @@ public class AlleleBiasedDownsamplingUtils {
        final List<Allele> alleles = new ArrayList<Allele>(alleleReadMap.keySet());
        alleles.remove(Allele.NO_CALL);    // ignore the no-call bin
        final int numAlleles = alleles.size();
+
+        // TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead
        final int[] alleleCounts = new int[numAlleles];
        for ( int i = 0; i < numAlleles; i++ )
            alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size();
@ -234,38 +265,52 @@ public class AlleleBiasedDownsamplingUtils {

        final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove);
        for ( int i = 0; i < numAlleles; i++ ) {
-            final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
-
-            if ( alleleBin.size() > targetAlleleCounts[i] ) {
-                readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
+            if ( alleleCounts[i] > targetAlleleCounts[i] ) {
+                readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i]));
            }
        }

        return readsToRemove;
    }

-    private static <T> void logAllElements(final List<T> elements, final PrintStream log) {
-        if ( log != null ) {
-            for ( final T obj : elements ) {
-                logElement(obj, log);
-            }
+    /**
+     * Performs allele biased down-sampling on a pileup and computes the list of elements to remove
+     *
+     * @param reads                     original list of records
+     * @param numElementsToRemove       the number of records to remove
+     * @return the list of pileup elements TO REMOVE
+     */
+    protected static List<GATKSAMRecord> downsampleElements(final List<GATKSAMRecord> reads, final int numElementsToRemove) {
+        // are there no elements to remove?
+        if ( numElementsToRemove == 0 )
+            return Collections.<GATKSAMRecord>emptyList();
+
+        final ArrayList<GATKSAMRecord> elementsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
+        final int originalElementCount = reads.size();
+
+        // should we remove all of the elements?
+        if ( numElementsToRemove >= originalElementCount ) {
+            elementsToRemove.addAll(reads);
+            return elementsToRemove;
        }
-    }

-    private static <T> void logElement(final T obj, final PrintStream log) {
-        if ( log != null ) {
-
-            final GATKSAMRecord read;
-            if ( obj instanceof PileupElement )
-                read = ((PileupElement)obj).getRead();
-            else
-                read = (GATKSAMRecord)obj;
-
-            final SAMReadGroupRecord readGroup = read.getReadGroup();
-            log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
+        // create a bitset describing which elements to remove
+        final BitSet itemsToRemove = new BitSet(originalElementCount);
+        for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) {
+            itemsToRemove.set(selectedIndex);
        }
-    }

+        int currentBitSetIndex = 0;
+        for ( final GATKSAMRecord read : reads ) {
+            if ( read.isReducedRead() )
+                throw new IllegalStateException("Allele-biased downsampling of reduced reads has not been implemented for a list of GATKSAMRecords");
+
+            if ( itemsToRemove.get(currentBitSetIndex++) )
+                elementsToRemove.add(read);
+        }
+
+        return elementsToRemove;
+    }

    /**
     * Create sample-contamination maps from file
--- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java
+++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java
@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.genotyper;


 import com.google.java.contract.Ensures;
-import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.sam.ReadUtils;
 import org.broadinstitute.variant.variantcontext.Allele;

-import java.io.PrintStream;
 import java.util.*;

 /**
@ -44,9 +42,8 @@ import java.util.*;
 *   For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
 */
 public class PerReadAlleleLikelihoodMap {
-    private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class);
-    protected List<Allele> alleles;
-    protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
+    protected final List<Allele> alleles;
+    protected final Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;

    public PerReadAlleleLikelihoodMap() {
        likelihoodReadMap = new LinkedHashMap<GATKSAMRecord,Map<Allele,Double>>();
@ -78,17 +75,16 @@ public class PerReadAlleleLikelihoodMap {

    }

-    public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
-        return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log);
+    public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
+        return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction);
    }

    /**
     * For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion
     * of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination
     * @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed.
-     * @param log - a PrintStream to log the removed reads to (passed through to the utility function)
     */
-    public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {
+    public void performPerAlleleDownsampling(final double downsamplingFraction) {
        // special case removal of all or no reads
        if ( downsamplingFraction <= 0.0 )
            return;
@ -101,7 +97,7 @@ public class PerReadAlleleLikelihoodMap {
        final Map<Allele, List<GATKSAMRecord>> alleleReadMap = getAlleleStratifiedReadMap();

        // compute the reads to remove and actually remove them
-        final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log);
+        final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction);
        for ( final GATKSAMRecord read : readsToRemove )
            likelihoodReadMap.remove(read);
    }
@ -117,7 +113,8 @@ public class PerReadAlleleLikelihoodMap {
            alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());

        for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
-            // do not remove reduced reads!
+            // TODO -- come up with a strategy for down-sampling reduced reads
+            // Currently we are unable to remove reduced reads because their representative base count differs throughout the read
            if ( !entry.getKey().isReducedRead() ) {
                final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue());
                if ( bestAllele.isInformative() )
--- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java
+++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java
@ -303,7 +303,7 @@ public class PileupElement implements Comparable<PileupElement> {
     * this being a reduced read and a deletion, we return the average number of elements between the left
     * and right elements to the deletion. We assume the deletion to be left aligned.
     *
-     * @return
+     * @return the representative count
     */
    public int getRepresentativeCount() {
        if (read.isReducedRead())     {
@ -318,6 +318,19 @@ public class PileupElement implements Comparable<PileupElement> {
        }
    }

+    /**
+     * Adjusts the representative count of this pileup element.
+     * Throws an exception if this element does not represent a reduced read.
+     *
+     * @param adjustmentFactor   how much to adjust the representative count (can be positive or negative)
+     */
+    public void adjustRepresentativeCount(final int adjustmentFactor) {
+        if ( read.isReducedRead() )
+            read.adjustReducedCount(offset, adjustmentFactor);
+        else
+            throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced");
+    }
+
    /**
     * Get the cigar element aligning this element to the genome
     * @return a non-null CigarElement
--- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java
@ -400,7 +400,40 @@ public class GATKSAMRecord extends BAMRecord {
     * @return the number of bases corresponding to the i'th base of the reduced read
     */
    public final byte getReducedCount(final int i) {
-        return getReducedReadCounts()[i];
+        if ( !isReducedRead() )
+            throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced");
+        final byte[] reducedCounts = getReducedReadCounts();
+        return reducedCounts[i];
+    }
+
+    /**
+     * Sets the number of bases corresponding the i'th base of the reduced read.
+     *
+     * @param i      the read based coordinate inside the read
+     * @param count  the new count
+     */
+    public final void setReducedCount(final int i, final byte count) {
+        if ( count < 0 )
+            throw new IllegalArgumentException("the reduced count cannot be set to a negative value");
+        if ( !isReducedRead() )
+            throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");
+        final byte[] reducedCounts = getReducedReadCounts();
+        reducedCounts[i] = count;
+        setReducedReadCountsTag(reducedCounts);
+    }
+
+    /**
+     * Sets the number of bases corresponding the i'th base of the reduced read.
+     *
+     * @param i                 the read based coordinate inside the read
+     * @param adjustmentFactor  how much to add/subtract to the current count
+     */
+    public final void adjustReducedCount(final int i, final int adjustmentFactor) {
+        if ( !isReducedRead() )
+            throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");
+        final byte[] reducedCounts = getReducedReadCounts();
+        final byte newCount = (byte)(reducedCounts[i] + adjustmentFactor);
+        setReducedCount(i, newCount);
    }

    /**
--- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java
@ -25,18 +25,21 @@

 package org.broadinstitute.sting.gatk.downsampling;

+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import net.sf.samtools.SAMFileHeader;
 import org.apache.log4j.Logger;
 import org.broadinstitute.sting.BaseTest;
 import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.testng.Assert;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;

 import java.io.File;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;


 /**
@ -115,6 +118,51 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
        return true;
    }

+    @DataProvider(name = "BiasedDownsamplingTest")
+    public Object[][] makeBiasedDownsamplingTest() {
+        final List<Object[]> tests = new LinkedList<Object[]>();
+
+        final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
+
+        for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) {
+            for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) {
+                for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) {
+                    if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount )
+                        continue;
+                    for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) {
+                        if ( toRemove <= originalNormalCount + originalReducedCount )
+                            tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove});
+                    }
+                }
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    @Test(dataProvider = "BiasedDownsamplingTest")
+    public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) {
+
+        final LinkedList<PileupElement> elements = new LinkedList<PileupElement>();
+        for ( int i = 0; i < originalNormalCount; i++ ) {
+            final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
+            elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
+        }
+        if ( originalReducedCount > 0 ) {
+            final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
+            read.setReducedReadCountsTag(new byte[]{(byte)originalReducedCount});
+            elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
+        }
+
+        final List<PileupElement> result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + originalReducedCount, toRemove);
+        int pileupCount = 0;
+        for ( final PileupElement pe : elements ) // reduced reads may have gotten modified
+            pileupCount += pe.getRepresentativeCount();
+        for ( final PileupElement pe : result )
+            pileupCount -= pe.getRepresentativeCount();
+
+        Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove);
+    }

    @Test
    public void testLoadContaminationFileDetails(){
--- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java
@ -39,7 +39,7 @@ public class GATKSAMRecordUnitTest extends BaseTest {
    final static String BASES = "ACTG";
    final static String QUALS = "!+5?";
    final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1};
-    final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9};  // just the offsets
+    final private static byte[] REDUCED_READ_COUNTS_OFFSETS = new byte[]{10, 10, 20, 30, -9};  // just the offsets

    @BeforeClass
    public void init() {
@ -52,7 +52,7 @@ public class GATKSAMRecordUnitTest extends BaseTest {
        reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length());
        reducedRead.setReadBases(BASES.getBytes());
        reducedRead.setBaseQualityString(QUALS);
-        reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG);
+        reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS_OFFSETS);
    }

    @Test
@ -66,6 +66,36 @@ public class GATKSAMRecordUnitTest extends BaseTest {
        }
    }

+    @Test(expectedExceptions = IllegalArgumentException.class)
+    public void testGetReducedCountOnNormalRead() {
+        read.getReducedCount(0);
+    }
+
+    @Test(expectedExceptions = IllegalArgumentException.class)
+    public void testSetReducedTagOnNormalRead() {
+        read.setReducedCount(0, (byte)2);
+    }
+
+    @Test(expectedExceptions = IllegalArgumentException.class)
+    public void testSetReducedCountToNegativeNumber() {
+        reducedRead.setReducedCount(0, (byte)1000);
+    }
+
+    @Test
+    public void testSetReducedTagOnReducedRead() {
+        for (int i = 0; i < reducedRead.getReadLength(); i++) {
+            final byte newCount = (byte)i;
+            reducedRead.setReducedCount(i, newCount);
+            Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
+        }
+
+        for (int i = 0; i < reducedRead.getReadLength(); i++) {
+            final int newCount = reducedRead.getReducedCount(i) + i;
+            reducedRead.adjustReducedCount(i, i);
+            Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
+        }
+    }
+
    @Test
    public void testReducedReadPileupElement() {
        PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0);