Extended the allele-biased down-sampling functionality to handle reduced reads.

Note that this works only in the case of pileups (i.e. coming from UG);
allele-biased down-sampling for reduced reads (RR) simply cannot work for haplotypes.

Added lots of unit tests for new functionality.
This commit is contained in:
Eric Banks 2013-04-24 11:52:26 -04:00
parent b749f06ba6
commit ba2c3b57ed
14 changed files with 268 additions and 111 deletions

View File

@ -54,7 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.Collections;
import java.util.Map;
@ -170,10 +169,6 @@ public class StandardCallerArgumentCollection {
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
@Hidden
@Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false)
public PrintStream contaminationLog = null;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
@ -192,7 +187,6 @@ public class StandardCallerArgumentCollection {
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE;
this.contaminationLog = SCAC.contaminationLog;
this.exactCallsLog = SCAC.exactCallsLog;
this.sampleContamination=SCAC.sampleContamination;
this.AFmodel = SCAC.AFmodel;

View File

@ -145,7 +145,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup != null) {
final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()), UAC.contaminationLog);
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()));
b.PL(genotypeLikelihoods);
b.DP(getFilteredDepth(pileup));
genotypes.add(b.make());

View File

@ -105,7 +105,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
final Double contamination = UAC.getSampleContamination().get(sample.getKey());
if( contamination > 0.0 ) //no need to enter if no contamination reduction
pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup,contamination, UAC.contaminationLog);
pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, contamination);
if ( useBAQedPileup )
pileup = createBAQedPileup(pileup);

View File

@ -64,7 +64,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class GenotypingEngine {
@ -196,13 +195,13 @@ public class GenotypingEngine {
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION );
final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC );
final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL);
if( call != null ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap :
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) );
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
VariantContext annotatedCall = call;
@ -406,8 +405,7 @@ public class GenotypingEngine {
// BUGBUG: ugh, too complicated
protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
final Map<Allele, List<Haplotype>> alleleMapper,
final double downsamplingFraction,
final PrintStream downsamplingLog ) {
final double downsamplingFraction ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
@ -424,7 +422,7 @@ public class GenotypingEngine {
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
}
}
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
}

View File

@ -61,7 +61,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
@ -213,13 +212,12 @@ public class PairHMMIndelErrorModel {
final ReferenceContext ref,
final int eventLength,
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
final double downsamplingFraction,
final PrintStream downsamplingLog) {
final double downsamplingFraction) {
final int numHaplotypes = haplotypeMap.size();
final int readCounts[] = new int[pileup.getNumberOfElements()];
final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts);
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction);
return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}

View File

@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8"));
Arrays.asList("f82389f2bc6d3f932a36be65b60af648"));
executeTest("test calling on a ReducedRead BAM", spec);
}
@Test
public void testReducedBamSNPs() {
testReducedCalling("SNP", "b424779c6609cb727a675bdd301290e6");
testReducedCalling("SNP", "c87f89af948a554cc66bc3afa5251c3b");
}
@Test
public void testReducedBamINDELs() {
testReducedCalling("INDEL", "38c3d14cb9086f7355788d3db9b8ff16");
testReducedCalling("INDEL", "ed4ddc42447ec037c1e14757b6cf0515");
}

View File

@ -211,7 +211,7 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest {
Assert.assertEquals(perReadAlleleLikelihoodMap.size(),pileup.depthOfCoverage()+10);
Assert.assertEquals(perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap().get(base_A).size(),60);
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1,null);
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1);
Assert.assertEquals(perReadAlleleLikelihoodMap.size(),(int) (0.9*(pileup.depthOfCoverage()+10)));
Map<Allele,List<GATKSAMRecord>> downsampledStrat = perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap();

View File

@ -104,8 +104,9 @@ public class GATKArgumentCollection {
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
@Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
public boolean disableRandomization = false;
@Hidden
@Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.")
public boolean disableDithering = false;
@Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false)
public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT;

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.sting.utils.exceptions.StingException;
@ -38,65 +37,62 @@ import org.broadinstitute.variant.variantcontext.Allele;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import org.apache.log4j.Logger;
public class AlleleBiasedDownsamplingUtils {
// define this class so that we can use Java generics below
private final static class PileupElementList extends ArrayList<PileupElement> {}
/**
* Computes an allele biased version of the given pileup
*
* @param pileup the original pileup
* @param downsamplingFraction the fraction of total reads to remove per allele
* @param log logging output
* @return allele biased pileup
*/
public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
// special case removal of all or no reads
if ( downsamplingFraction <= 0.0 )
return pileup;
if ( downsamplingFraction >= 1.0 )
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
final PileupElementList[] alleleStratifiedElements = new PileupElementList[4];
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
alleleStratifiedElements[i] = new PileupElementList();
// start by stratifying the reads by the alleles they represent at this position
boolean sawReducedRead = false;
for ( final PileupElement pe : pileup ) {
// we do not want to remove a reduced read
if ( !pe.getRead().isReducedRead() ) {
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
if ( baseIndex != -1 )
alleleStratifiedElements[baseIndex].add(pe);
}
if ( pe.getRead().isReducedRead() )
sawReducedRead = true;
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
if ( baseIndex != -1 )
alleleStratifiedElements[baseIndex].add(pe);
}
// make a listing of allele counts
final int[] alleleCounts = new int[4];
for ( int i = 0; i < 4; i++ )
alleleCounts[i] = alleleStratifiedElements[i].size();
// make a listing of allele counts and calculate the total count
final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead);
final int totalAlleleCount = (int)MathUtils.sum(alleleCounts);
// do smart down-sampling
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
final HashSet<PileupElement> readsToRemove = new HashSet<PileupElement>(numReadsToRemove);
for ( int i = 0; i < 4; i++ ) {
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
final PileupElementList alleleList = alleleStratifiedElements[i];
// if we don't need to remove any reads, then don't
if ( alleleList.size() > targetAlleleCounts[i] )
readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
if ( alleleCounts[i] > targetAlleleCounts[i] )
readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i]));
}
// clean up pointers so memory can be garbage collected if needed
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i].clear();
// we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise
final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(pileup.getNumberOfElements() - numReadsToRemove);
final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(totalAlleleCount - numReadsToRemove);
for ( final PileupElement pe : pileup ) {
if ( !readsToRemove.contains(pe) ) {
readsToKeep.add(pe);
@ -106,6 +102,26 @@ public class AlleleBiasedDownsamplingUtils {
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(readsToKeep));
}
/**
 * Calculates the per-allele element counts. When at least one reduced read is present the
 * plain list size under-counts (a reduced read stands in for many underlying reads), so in
 * that case we sum each element's representative count instead.
 *
 * @param alleleStratifiedElements pileup elements stratified by allele
 * @param sawReducedRead is at least one read a reduced read?
 * @return non-null int array representing allele counts
 */
private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) {
    final int numAlleles = alleleStratifiedElements.length;
    final int[] counts = new int[numAlleles];
    for ( int alleleIndex = 0; alleleIndex < numAlleles; alleleIndex++ ) {
        final PileupElementList elements = alleleStratifiedElements[alleleIndex];
        if ( sawReducedRead ) {
            // sum the representative counts so reduced reads are weighted correctly
            int total = 0;
            for ( final PileupElement pe : elements )
                total += pe.getRepresentativeCount();
            counts[alleleIndex] = total;
        } else {
            // no reduced reads: every element represents exactly one read
            counts[alleleIndex] = elements.size();
        }
    }
    return counts;
}
private static int scoreAlleleCounts(final int[] alleleCounts) {
if ( alleleCounts.length < 2 )
return 0;
@ -128,11 +144,11 @@ public class AlleleBiasedDownsamplingUtils {
}
/**
* Computes an allele biased version of the given pileup
* Computes an allele biased version of the allele counts for a given pileup
*
* @param alleleCounts the original pileup
* @param numReadsToRemove fraction of total reads to remove per allele
* @return allele biased pileup
* @param alleleCounts the allele counts for the original pileup
* @param numReadsToRemove number of total reads to remove per allele
* @return non-null array of new counts needed per allele
*/
protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) {
final int numAlleles = alleleCounts.length;
@ -169,36 +185,50 @@ public class AlleleBiasedDownsamplingUtils {
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
*
* @param elements original list of records
* @param elements original list of pileup elements
* @param originalElementCount original count of elements (taking reduced reads into account)
* @param numElementsToRemove the number of records to remove
* @param log logging output
* @return the list of pileup elements TO REMOVE
*/
private static <T> List<T> downsampleElements(final List<T> elements, final int numElementsToRemove, final PrintStream log) {
ArrayList<T> elementsToRemove = new ArrayList<T>(numElementsToRemove);
protected static List<PileupElement> downsampleElements(final List<PileupElement> elements, final int originalElementCount, final int numElementsToRemove) {
// are there no elements to remove?
if ( numElementsToRemove == 0 )
return elementsToRemove;
return Collections.<PileupElement>emptyList();
final ArrayList<PileupElement> elementsToRemove = new ArrayList<PileupElement>(numElementsToRemove);
// should we remove all of the elements?
final int pileupSize = elements.size();
if ( numElementsToRemove == pileupSize ) {
logAllElements(elements, log);
if ( numElementsToRemove >= originalElementCount ) {
elementsToRemove.addAll(elements);
return elementsToRemove;
}
// create a bitset describing which elements to remove
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
final BitSet itemsToRemove = new BitSet(originalElementCount);
for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
for ( int i = 0; i < pileupSize; i++ ) {
if ( itemsToRemove.get(i) ) {
final T element = elements.get(i);
logElement(element, log);
int currentBitSetIndex = 0;
for ( final PileupElement element : elements ) {
final int representativeCount = element.getRepresentativeCount();
// if it's a reduced read, we need to be smart about how we down-sample
if ( representativeCount > 1 ) {
// count how many bits are set over the span represented by this read
int setBits = 0;
for ( int i = 0; i < representativeCount; i++ )
setBits += itemsToRemove.get(currentBitSetIndex++) ? 1 : 0;
// remove that count from the count of the reduced read
if ( setBits == representativeCount )
elementsToRemove.add(element);
else
element.adjustRepresentativeCount(-1 * setBits);
}
// otherwise it's trivial: remove if the corresponding bit is set
else if ( itemsToRemove.get(currentBitSetIndex++) ) {
elementsToRemove.add(element);
}
}
@ -211,10 +241,9 @@ public class AlleleBiasedDownsamplingUtils {
*
* @param alleleReadMap original list of records per allele
* @param downsamplingFraction the fraction of total reads to remove per allele
* @param log logging output
* @return list of reads TO REMOVE from allele biased down-sampling
*/
public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction, final PrintStream log) {
public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction) {
int totalReads = 0;
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
totalReads += reads.size();
@ -225,6 +254,8 @@ public class AlleleBiasedDownsamplingUtils {
final List<Allele> alleles = new ArrayList<Allele>(alleleReadMap.keySet());
alleles.remove(Allele.NO_CALL); // ignore the no-call bin
final int numAlleles = alleles.size();
// TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead
final int[] alleleCounts = new int[numAlleles];
for ( int i = 0; i < numAlleles; i++ )
alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size();
@ -234,38 +265,52 @@ public class AlleleBiasedDownsamplingUtils {
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove);
for ( int i = 0; i < numAlleles; i++ ) {
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
if ( alleleBin.size() > targetAlleleCounts[i] ) {
readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
if ( alleleCounts[i] > targetAlleleCounts[i] ) {
readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i]));
}
}
return readsToRemove;
}
private static <T> void logAllElements(final List<T> elements, final PrintStream log) {
if ( log != null ) {
for ( final T obj : elements ) {
logElement(obj, log);
}
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
*
* @param reads original list of records
* @param numElementsToRemove the number of records to remove
* @return the list of pileup elements TO REMOVE
*/
protected static List<GATKSAMRecord> downsampleElements(final List<GATKSAMRecord> reads, final int numElementsToRemove) {
// are there no elements to remove?
if ( numElementsToRemove == 0 )
return Collections.<GATKSAMRecord>emptyList();
final ArrayList<GATKSAMRecord> elementsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
final int originalElementCount = reads.size();
// should we remove all of the elements?
if ( numElementsToRemove >= originalElementCount ) {
elementsToRemove.addAll(reads);
return elementsToRemove;
}
}
private static <T> void logElement(final T obj, final PrintStream log) {
if ( log != null ) {
final GATKSAMRecord read;
if ( obj instanceof PileupElement )
read = ((PileupElement)obj).getRead();
else
read = (GATKSAMRecord)obj;
final SAMReadGroupRecord readGroup = read.getReadGroup();
log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
// create a bitset describing which elements to remove
final BitSet itemsToRemove = new BitSet(originalElementCount);
for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
}
int currentBitSetIndex = 0;
for ( final GATKSAMRecord read : reads ) {
if ( read.isReducedRead() )
throw new IllegalStateException("Allele-biased downsampling of reduced reads has not been implemented for a list of GATKSAMRecords");
if ( itemsToRemove.get(currentBitSetIndex++) )
elementsToRemove.add(read);
}
return elementsToRemove;
}
/**
* Create sample-contamination maps from file

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.genotyper;
import com.google.java.contract.Ensures;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import java.io.PrintStream;
import java.util.*;
/**
@ -44,9 +42,8 @@ import java.util.*;
* For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
*/
public class PerReadAlleleLikelihoodMap {
private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class);
protected List<Allele> alleles;
protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
protected final List<Allele> alleles;
protected final Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
public PerReadAlleleLikelihoodMap() {
likelihoodReadMap = new LinkedHashMap<GATKSAMRecord,Map<Allele,Double>>();
@ -78,17 +75,16 @@ public class PerReadAlleleLikelihoodMap {
}
public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log);
public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction);
}
/**
* For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion
* of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination
* @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed.
* @param log - a PrintStream to log the removed reads to (passed through to the utility function)
*/
public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {
public void performPerAlleleDownsampling(final double downsamplingFraction) {
// special case removal of all or no reads
if ( downsamplingFraction <= 0.0 )
return;
@ -101,7 +97,7 @@ public class PerReadAlleleLikelihoodMap {
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = getAlleleStratifiedReadMap();
// compute the reads to remove and actually remove them
final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log);
final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction);
for ( final GATKSAMRecord read : readsToRemove )
likelihoodReadMap.remove(read);
}
@ -117,7 +113,8 @@ public class PerReadAlleleLikelihoodMap {
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
// do not remove reduced reads!
// TODO -- come up with a strategy for down-sampling reduced reads
// Currently we are unable to remove reduced reads because their representative base count differs throughout the read
if ( !entry.getKey().isReducedRead() ) {
final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue());
if ( bestAllele.isInformative() )

View File

@ -303,7 +303,7 @@ public class PileupElement implements Comparable<PileupElement> {
* this being a reduced read and a deletion, we return the average number of elements between the left
* and right elements to the deletion. We assume the deletion to be left aligned.
*
* @return
* @return the representative count
*/
public int getRepresentativeCount() {
if (read.isReducedRead()) {
@ -318,6 +318,19 @@ public class PileupElement implements Comparable<PileupElement> {
}
}
/**
 * Adjusts the representative count of this pileup element by the given delta.
 *
 * @param adjustmentFactor how much to adjust the representative count (can be positive or negative)
 * @throws IllegalArgumentException if the underlying read is not a reduced read
 */
public void adjustRepresentativeCount(final int adjustmentFactor) {
    // only reduced reads carry a per-base representative count that can be adjusted
    if ( !read.isReducedRead() )
        throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced");
    read.adjustReducedCount(offset, adjustmentFactor);
}
/**
* Get the cigar element aligning this element to the genome
* @return a non-null CigarElement

View File

@ -400,7 +400,40 @@ public class GATKSAMRecord extends BAMRecord {
* @return the number of bases corresponding to the i'th base of the reduced read
*/
public final byte getReducedCount(final int i) {
return getReducedReadCounts()[i];
if ( !isReducedRead() )
throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced");
final byte[] reducedCounts = getReducedReadCounts();
return reducedCounts[i];
}
/**
 * Sets the representative count for the i'th base of this reduced read.
 *
 * @param i     the read based coordinate inside the read
 * @param count the new count; must be non-negative
 * @throws IllegalArgumentException if count is negative or this read is not a reduced read
 */
public final void setReducedCount(final int i, final byte count) {
    if ( count < 0 )
        throw new IllegalArgumentException("the reduced count cannot be set to a negative value");
    if ( !isReducedRead() )
        throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");

    // update the in-memory counts array, then push it back through the tag setter
    final byte[] updatedCounts = getReducedReadCounts();
    updatedCounts[i] = count;
    setReducedReadCountsTag(updatedCounts);
}
/**
 * Adjusts (adds adjustmentFactor to) the representative count of the i'th base of the reduced read.
 *
 * @param i the read based coordinate inside the read
 * @param adjustmentFactor how much to add/subtract to the current count
 * @throws IllegalArgumentException if this read is not a reduced read (or the resulting count is negative, via setReducedCount)
 */
public final void adjustReducedCount(final int i, final int adjustmentFactor) {
if ( !isReducedRead() )
throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");
final byte[] reducedCounts = getReducedReadCounts();
// NOTE(review): the sum is narrowed to byte, so a large adjustment can wrap; setReducedCount
// rejects a wrapped-negative result, but a wrap back to a positive value would go undetected -- confirm intended
final byte newCount = (byte)(reducedCounts[i] + adjustmentFactor);
setReducedCount(i, newCount);
}
/**

View File

@ -25,18 +25,21 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.*;
/**
@ -115,6 +118,51 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
return true;
}
@DataProvider(name = "BiasedDownsamplingTest")
public Object[][] makeBiasedDownsamplingTest() {
// Builds (header, normalCount, reducedCount, reducedReadIndex, numToRemove) tuples covering
// pileups composed of normal reads plus one optional reduced read.
final List<Object[]> tests = new LinkedList<Object[]>();
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) {
for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) {
for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) {
// skip combinations with no reduced read, or whose insertion index falls past the end of the element list
if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount )
continue;
for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) {
// can't ask to remove more elements than the pileup represents
if ( toRemove <= originalNormalCount + originalReducedCount )
tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove});
}
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "BiasedDownsamplingTest")
public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) {
// build a pileup of single-base normal reads, optionally with one reduced read inserted
final LinkedList<PileupElement> elements = new LinkedList<PileupElement>();
for ( int i = 0; i < originalNormalCount; i++ ) {
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
}
if ( originalReducedCount > 0 ) {
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
// a single-base reduced read standing in for originalReducedCount underlying reads
read.setReducedReadCountsTag(new byte[]{(byte)originalReducedCount});
elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
}
final List<PileupElement> result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + originalReducedCount, toRemove);
// verify that exactly toRemove representative reads were removed: down-sampling may have
// decremented the reduced read's count in place, so re-sum the (possibly modified) input
// elements and subtract everything flagged for removal
int pileupCount = 0;
for ( final PileupElement pe : elements ) // reduced reads may have gotten modified
pileupCount += pe.getRepresentativeCount();
for ( final PileupElement pe : result )
pileupCount -= pe.getRepresentativeCount();
Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove);
}
@Test
public void testLoadContaminationFileDetails(){

View File

@ -39,7 +39,7 @@ public class GATKSAMRecordUnitTest extends BaseTest {
final static String BASES = "ACTG";
final static String QUALS = "!+5?";
final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1};
final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets
final private static byte[] REDUCED_READ_COUNTS_OFFSETS = new byte[]{10, 10, 20, 30, -9}; // just the offsets
@BeforeClass
public void init() {
@ -52,7 +52,7 @@ public class GATKSAMRecordUnitTest extends BaseTest {
reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length());
reducedRead.setReadBases(BASES.getBytes());
reducedRead.setBaseQualityString(QUALS);
reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG);
reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS_OFFSETS);
}
@Test
@ -66,6 +66,36 @@ public class GATKSAMRecordUnitTest extends BaseTest {
}
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testGetReducedCountOnNormalRead() {
// getReducedCount must reject a read that is not reduced
read.getReducedCount(0);
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testSetReducedTagOnNormalRead() {
// setReducedCount must reject a read that is not reduced
read.setReducedCount(0, (byte)2);
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testSetReducedCountToNegativeNumber() {
// (byte)1000 narrows to -24, so despite appearances this does exercise the negative-count check
reducedRead.setReducedCount(0, (byte)1000);
}
@Test
public void testSetReducedTagOnReducedRead() {
// first pass: set each position's count directly and read it back
for (int i = 0; i < reducedRead.getReadLength(); i++) {
final byte newCount = (byte)i;
reducedRead.setReducedCount(i, newCount);
Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
}
// second pass: adjust each count by its own index and verify the delta was applied
for (int i = 0; i < reducedRead.getReadLength(); i++) {
final int newCount = reducedRead.getReducedCount(i) + i;
reducedRead.adjustReducedCount(i, i);
Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
}
}
@Test
public void testReducedReadPileupElement() {
PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0);