Extended the allele-biased down-sampling functionality to handle reduced reads.
Note that this works only in the case of pileups (i.e. coming from UG); allele-biased down-sampling for RR just cannot work for haplotypes. Added lots of unit tests for new functionality.
This commit is contained in:
parent
b749f06ba6
commit
ba2c3b57ed
|
|
@ -54,7 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
|
|||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
|
|
@ -170,10 +169,6 @@ public class StandardCallerArgumentCollection {
|
|||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false)
|
||||
public PrintStream contaminationLog = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logExactCalls", doc="x", required=false)
|
||||
public File exactCallsLog = null;
|
||||
|
|
@ -192,7 +187,6 @@ public class StandardCallerArgumentCollection {
|
|||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
|
||||
this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE;
|
||||
this.contaminationLog = SCAC.contaminationLog;
|
||||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
this.sampleContamination=SCAC.sampleContamination;
|
||||
this.AFmodel = SCAC.AFmodel;
|
||||
|
|
|
|||
|
|
@ -145,7 +145,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
final ReadBackedPileup pileup = context.getBasePileup();
|
||||
if (pileup != null) {
|
||||
final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
|
||||
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()), UAC.contaminationLog);
|
||||
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()));
|
||||
b.PL(genotypeLikelihoods);
|
||||
b.DP(getFilteredDepth(pileup));
|
||||
genotypes.add(b.make());
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
|
||||
final Double contamination = UAC.getSampleContamination().get(sample.getKey());
|
||||
if( contamination > 0.0 ) //no need to enter if no contamination reduction
|
||||
pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup,contamination, UAC.contaminationLog);
|
||||
pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, contamination);
|
||||
if ( useBAQedPileup )
|
||||
pileup = createBAQedPileup(pileup);
|
||||
|
||||
|
|
|
|||
|
|
@ -64,7 +64,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GenotypingEngine {
|
||||
|
|
@ -196,13 +195,13 @@ public class GenotypingEngine {
|
|||
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
}
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION );
|
||||
|
||||
final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC );
|
||||
final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||
if( call != null ) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap :
|
||||
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) );
|
||||
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
|
||||
|
||||
VariantContext annotatedCall = call;
|
||||
|
|
@ -406,8 +405,7 @@ public class GenotypingEngine {
|
|||
// BUGBUG: ugh, too complicated
|
||||
protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final Map<Allele, List<Haplotype>> alleleMapper,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog ) {
|
||||
final double downsamplingFraction ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
|
||||
|
|
@ -424,7 +422,7 @@ public class GenotypingEngine {
|
|||
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
|
||||
}
|
||||
}
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling
|
||||
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -61,7 +61,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
|
@ -213,13 +212,12 @@ public class PairHMMIndelErrorModel {
|
|||
final ReferenceContext ref,
|
||||
final int eventLength,
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog) {
|
||||
final double downsamplingFraction) {
|
||||
final int numHaplotypes = haplotypeMap.size();
|
||||
|
||||
final int readCounts[] = new int[pileup.getNumberOfElements()];
|
||||
final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts);
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction);
|
||||
return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
|
|||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8"));
|
||||
Arrays.asList("f82389f2bc6d3f932a36be65b60af648"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "b424779c6609cb727a675bdd301290e6");
|
||||
testReducedCalling("SNP", "c87f89af948a554cc66bc3afa5251c3b");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamINDELs() {
|
||||
testReducedCalling("INDEL", "38c3d14cb9086f7355788d3db9b8ff16");
|
||||
testReducedCalling("INDEL", "ed4ddc42447ec037c1e14757b6cf0515");
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -211,7 +211,7 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest {
|
|||
|
||||
Assert.assertEquals(perReadAlleleLikelihoodMap.size(),pileup.depthOfCoverage()+10);
|
||||
Assert.assertEquals(perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap().get(base_A).size(),60);
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1,null);
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1);
|
||||
Assert.assertEquals(perReadAlleleLikelihoodMap.size(),(int) (0.9*(pileup.depthOfCoverage()+10)));
|
||||
|
||||
Map<Allele,List<GATKSAMRecord>> downsampledStrat = perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap();
|
||||
|
|
|
|||
|
|
@ -104,8 +104,9 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
|
||||
public boolean nonDeterministicRandomSeed = false;
|
||||
|
||||
@Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
|
||||
public boolean disableRandomization = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.")
|
||||
public boolean disableDithering = false;
|
||||
|
||||
@Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false)
|
||||
public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT;
|
||||
|
|
|
|||
|
|
@ -25,7 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.DefaultHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
|
@ -38,65 +37,62 @@ import org.broadinstitute.variant.variantcontext.Allele;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
public class AlleleBiasedDownsamplingUtils {
|
||||
|
||||
// define this class so that we can use Java generics below
|
||||
private final static class PileupElementList extends ArrayList<PileupElement> {}
|
||||
|
||||
/**
|
||||
* Computes an allele biased version of the given pileup
|
||||
*
|
||||
* @param pileup the original pileup
|
||||
* @param downsamplingFraction the fraction of total reads to remove per allele
|
||||
* @param log logging output
|
||||
* @return allele biased pileup
|
||||
*/
|
||||
public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
|
||||
public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
|
||||
// special case removal of all or no reads
|
||||
if ( downsamplingFraction <= 0.0 )
|
||||
return pileup;
|
||||
if ( downsamplingFraction >= 1.0 )
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
|
||||
|
||||
final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
|
||||
final PileupElementList[] alleleStratifiedElements = new PileupElementList[4];
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
|
||||
alleleStratifiedElements[i] = new PileupElementList();
|
||||
|
||||
// start by stratifying the reads by the alleles they represent at this position
|
||||
boolean sawReducedRead = false;
|
||||
for ( final PileupElement pe : pileup ) {
|
||||
// we do not want to remove a reduced read
|
||||
if ( !pe.getRead().isReducedRead() ) {
|
||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||
if ( baseIndex != -1 )
|
||||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
if ( pe.getRead().isReducedRead() )
|
||||
sawReducedRead = true;
|
||||
|
||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||
if ( baseIndex != -1 )
|
||||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
|
||||
// make a listing of allele counts
|
||||
final int[] alleleCounts = new int[4];
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleCounts[i] = alleleStratifiedElements[i].size();
|
||||
// make a listing of allele counts and calculate the total count
|
||||
final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead);
|
||||
final int totalAlleleCount = (int)MathUtils.sum(alleleCounts);
|
||||
|
||||
// do smart down-sampling
|
||||
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
|
||||
final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor
|
||||
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
|
||||
|
||||
final HashSet<PileupElement> readsToRemove = new HashSet<PileupElement>(numReadsToRemove);
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
||||
final PileupElementList alleleList = alleleStratifiedElements[i];
|
||||
// if we don't need to remove any reads, then don't
|
||||
if ( alleleList.size() > targetAlleleCounts[i] )
|
||||
readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
|
||||
if ( alleleCounts[i] > targetAlleleCounts[i] )
|
||||
readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i]));
|
||||
}
|
||||
|
||||
// clean up pointers so memory can be garbage collected if needed
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i].clear();
|
||||
|
||||
// we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise
|
||||
final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(pileup.getNumberOfElements() - numReadsToRemove);
|
||||
final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(totalAlleleCount - numReadsToRemove);
|
||||
for ( final PileupElement pe : pileup ) {
|
||||
if ( !readsToRemove.contains(pe) ) {
|
||||
readsToKeep.add(pe);
|
||||
|
|
@ -106,6 +102,26 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(readsToKeep));
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present)
|
||||
*
|
||||
* @param alleleStratifiedElements pileup elements stratified by allele
|
||||
* @param sawReducedRead is at least one read a reduced read?
|
||||
* @return non-null int array representing allele counts
|
||||
*/
|
||||
private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) {
|
||||
final int[] alleleCounts = new int[alleleStratifiedElements.length];
|
||||
for ( int i = 0; i < alleleStratifiedElements.length; i++ ) {
|
||||
if ( !sawReducedRead ) {
|
||||
alleleCounts[i] = alleleStratifiedElements[i].size();
|
||||
} else {
|
||||
for ( final PileupElement pe : alleleStratifiedElements[i] )
|
||||
alleleCounts[i] += pe.getRepresentativeCount();
|
||||
}
|
||||
}
|
||||
return alleleCounts;
|
||||
}
|
||||
|
||||
private static int scoreAlleleCounts(final int[] alleleCounts) {
|
||||
if ( alleleCounts.length < 2 )
|
||||
return 0;
|
||||
|
|
@ -128,11 +144,11 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Computes an allele biased version of the given pileup
|
||||
* Computes an allele biased version of the allele counts for a given pileup
|
||||
*
|
||||
* @param alleleCounts the original pileup
|
||||
* @param numReadsToRemove fraction of total reads to remove per allele
|
||||
* @return allele biased pileup
|
||||
* @param alleleCounts the allele counts for the original pileup
|
||||
* @param numReadsToRemove number of total reads to remove per allele
|
||||
* @return non-null array of new counts needed per allele
|
||||
*/
|
||||
protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) {
|
||||
final int numAlleles = alleleCounts.length;
|
||||
|
|
@ -169,36 +185,50 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
/**
|
||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
|
||||
*
|
||||
* @param elements original list of records
|
||||
* @param elements original list of pileup elements
|
||||
* @param originalElementCount original count of elements (taking reduced reads into account)
|
||||
* @param numElementsToRemove the number of records to remove
|
||||
* @param log logging output
|
||||
* @return the list of pileup elements TO REMOVE
|
||||
*/
|
||||
private static <T> List<T> downsampleElements(final List<T> elements, final int numElementsToRemove, final PrintStream log) {
|
||||
ArrayList<T> elementsToRemove = new ArrayList<T>(numElementsToRemove);
|
||||
|
||||
protected static List<PileupElement> downsampleElements(final List<PileupElement> elements, final int originalElementCount, final int numElementsToRemove) {
|
||||
// are there no elements to remove?
|
||||
if ( numElementsToRemove == 0 )
|
||||
return elementsToRemove;
|
||||
return Collections.<PileupElement>emptyList();
|
||||
|
||||
final ArrayList<PileupElement> elementsToRemove = new ArrayList<PileupElement>(numElementsToRemove);
|
||||
|
||||
// should we remove all of the elements?
|
||||
final int pileupSize = elements.size();
|
||||
if ( numElementsToRemove == pileupSize ) {
|
||||
logAllElements(elements, log);
|
||||
if ( numElementsToRemove >= originalElementCount ) {
|
||||
elementsToRemove.addAll(elements);
|
||||
return elementsToRemove;
|
||||
}
|
||||
|
||||
// create a bitset describing which elements to remove
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
final BitSet itemsToRemove = new BitSet(originalElementCount);
|
||||
for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( itemsToRemove.get(i) ) {
|
||||
final T element = elements.get(i);
|
||||
logElement(element, log);
|
||||
int currentBitSetIndex = 0;
|
||||
for ( final PileupElement element : elements ) {
|
||||
|
||||
final int representativeCount = element.getRepresentativeCount();
|
||||
|
||||
// if it's a reduced read, we need to be smart about how we down-sample
|
||||
if ( representativeCount > 1 ) {
|
||||
// count how many bits are set over the span represented by this read
|
||||
int setBits = 0;
|
||||
for ( int i = 0; i < representativeCount; i++ )
|
||||
setBits += itemsToRemove.get(currentBitSetIndex++) ? 1 : 0;
|
||||
|
||||
// remove that count from the count of the reduced read
|
||||
if ( setBits == representativeCount )
|
||||
elementsToRemove.add(element);
|
||||
else
|
||||
element.adjustRepresentativeCount(-1 * setBits);
|
||||
}
|
||||
// otherwise it's trivial: remove if the corresponding bit is set
|
||||
else if ( itemsToRemove.get(currentBitSetIndex++) ) {
|
||||
elementsToRemove.add(element);
|
||||
}
|
||||
}
|
||||
|
|
@ -211,10 +241,9 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
*
|
||||
* @param alleleReadMap original list of records per allele
|
||||
* @param downsamplingFraction the fraction of total reads to remove per allele
|
||||
* @param log logging output
|
||||
* @return list of reads TO REMOVE from allele biased down-sampling
|
||||
*/
|
||||
public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction, final PrintStream log) {
|
||||
public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction) {
|
||||
int totalReads = 0;
|
||||
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
|
||||
totalReads += reads.size();
|
||||
|
|
@ -225,6 +254,8 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
final List<Allele> alleles = new ArrayList<Allele>(alleleReadMap.keySet());
|
||||
alleles.remove(Allele.NO_CALL); // ignore the no-call bin
|
||||
final int numAlleles = alleles.size();
|
||||
|
||||
// TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead
|
||||
final int[] alleleCounts = new int[numAlleles];
|
||||
for ( int i = 0; i < numAlleles; i++ )
|
||||
alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size();
|
||||
|
|
@ -234,38 +265,52 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
|
||||
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove);
|
||||
for ( int i = 0; i < numAlleles; i++ ) {
|
||||
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
|
||||
|
||||
if ( alleleBin.size() > targetAlleleCounts[i] ) {
|
||||
readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
|
||||
if ( alleleCounts[i] > targetAlleleCounts[i] ) {
|
||||
readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i]));
|
||||
}
|
||||
}
|
||||
|
||||
return readsToRemove;
|
||||
}
|
||||
|
||||
private static <T> void logAllElements(final List<T> elements, final PrintStream log) {
|
||||
if ( log != null ) {
|
||||
for ( final T obj : elements ) {
|
||||
logElement(obj, log);
|
||||
}
|
||||
/**
|
||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
|
||||
*
|
||||
* @param reads original list of records
|
||||
* @param numElementsToRemove the number of records to remove
|
||||
* @return the list of pileup elements TO REMOVE
|
||||
*/
|
||||
protected static List<GATKSAMRecord> downsampleElements(final List<GATKSAMRecord> reads, final int numElementsToRemove) {
|
||||
// are there no elements to remove?
|
||||
if ( numElementsToRemove == 0 )
|
||||
return Collections.<GATKSAMRecord>emptyList();
|
||||
|
||||
final ArrayList<GATKSAMRecord> elementsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
|
||||
final int originalElementCount = reads.size();
|
||||
|
||||
// should we remove all of the elements?
|
||||
if ( numElementsToRemove >= originalElementCount ) {
|
||||
elementsToRemove.addAll(reads);
|
||||
return elementsToRemove;
|
||||
}
|
||||
}
|
||||
|
||||
private static <T> void logElement(final T obj, final PrintStream log) {
|
||||
if ( log != null ) {
|
||||
|
||||
final GATKSAMRecord read;
|
||||
if ( obj instanceof PileupElement )
|
||||
read = ((PileupElement)obj).getRead();
|
||||
else
|
||||
read = (GATKSAMRecord)obj;
|
||||
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
|
||||
// create a bitset describing which elements to remove
|
||||
final BitSet itemsToRemove = new BitSet(originalElementCount);
|
||||
for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
}
|
||||
|
||||
int currentBitSetIndex = 0;
|
||||
for ( final GATKSAMRecord read : reads ) {
|
||||
if ( read.isReducedRead() )
|
||||
throw new IllegalStateException("Allele-biased downsampling of reduced reads has not been implemented for a list of GATKSAMRecords");
|
||||
|
||||
if ( itemsToRemove.get(currentBitSetIndex++) )
|
||||
elementsToRemove.add(read);
|
||||
}
|
||||
|
||||
return elementsToRemove;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create sample-contamination maps from file
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.genotyper;
|
|||
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
|
@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -44,9 +42,8 @@ import java.util.*;
|
|||
* For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
|
||||
*/
|
||||
public class PerReadAlleleLikelihoodMap {
|
||||
private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class);
|
||||
protected List<Allele> alleles;
|
||||
protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
|
||||
protected final List<Allele> alleles;
|
||||
protected final Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
|
||||
|
||||
public PerReadAlleleLikelihoodMap() {
|
||||
likelihoodReadMap = new LinkedHashMap<GATKSAMRecord,Map<Allele,Double>>();
|
||||
|
|
@ -78,17 +75,16 @@ public class PerReadAlleleLikelihoodMap {
|
|||
|
||||
}
|
||||
|
||||
public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
|
||||
return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log);
|
||||
public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
|
||||
return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction);
|
||||
}
|
||||
|
||||
/**
|
||||
* For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion
|
||||
* of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination
|
||||
* @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed.
|
||||
* @param log - a PrintStream to log the removed reads to (passed through to the utility function)
|
||||
*/
|
||||
public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {
|
||||
public void performPerAlleleDownsampling(final double downsamplingFraction) {
|
||||
// special case removal of all or no reads
|
||||
if ( downsamplingFraction <= 0.0 )
|
||||
return;
|
||||
|
|
@ -101,7 +97,7 @@ public class PerReadAlleleLikelihoodMap {
|
|||
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = getAlleleStratifiedReadMap();
|
||||
|
||||
// compute the reads to remove and actually remove them
|
||||
final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log);
|
||||
final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction);
|
||||
for ( final GATKSAMRecord read : readsToRemove )
|
||||
likelihoodReadMap.remove(read);
|
||||
}
|
||||
|
|
@ -117,7 +113,8 @@ public class PerReadAlleleLikelihoodMap {
|
|||
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
|
||||
|
||||
for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
|
||||
// do not remove reduced reads!
|
||||
// TODO -- come up with a strategy for down-sampling reduced reads
|
||||
// Currently we are unable to remove reduced reads because their representative base count differs throughout the read
|
||||
if ( !entry.getKey().isReducedRead() ) {
|
||||
final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue());
|
||||
if ( bestAllele.isInformative() )
|
||||
|
|
|
|||
|
|
@ -303,7 +303,7 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
* this being a reduced read and a deletion, we return the average number of elements between the left
|
||||
* and right elements to the deletion. We assume the deletion to be left aligned.
|
||||
*
|
||||
* @return
|
||||
* @return the representative count
|
||||
*/
|
||||
public int getRepresentativeCount() {
|
||||
if (read.isReducedRead()) {
|
||||
|
|
@ -318,6 +318,19 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjusts the representative count of this pileup element.
|
||||
* Throws an exception if this element does not represent a reduced read.
|
||||
*
|
||||
* @param adjustmentFactor how much to adjust the representative count (can be positive or negative)
|
||||
*/
|
||||
public void adjustRepresentativeCount(final int adjustmentFactor) {
|
||||
if ( read.isReducedRead() )
|
||||
read.adjustReducedCount(offset, adjustmentFactor);
|
||||
else
|
||||
throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the cigar element aligning this element to the genome
|
||||
* @return a non-null CigarElement
|
||||
|
|
|
|||
|
|
@ -400,7 +400,40 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
* @return the number of bases corresponding to the i'th base of the reduced read
|
||||
*/
|
||||
public final byte getReducedCount(final int i) {
|
||||
return getReducedReadCounts()[i];
|
||||
if ( !isReducedRead() )
|
||||
throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced");
|
||||
final byte[] reducedCounts = getReducedReadCounts();
|
||||
return reducedCounts[i];
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the number of bases corresponding the i'th base of the reduced read.
|
||||
*
|
||||
* @param i the read based coordinate inside the read
|
||||
* @param count the new count
|
||||
*/
|
||||
public final void setReducedCount(final int i, final byte count) {
|
||||
if ( count < 0 )
|
||||
throw new IllegalArgumentException("the reduced count cannot be set to a negative value");
|
||||
if ( !isReducedRead() )
|
||||
throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");
|
||||
final byte[] reducedCounts = getReducedReadCounts();
|
||||
reducedCounts[i] = count;
|
||||
setReducedReadCountsTag(reducedCounts);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the number of bases corresponding the i'th base of the reduced read.
|
||||
*
|
||||
* @param i the read based coordinate inside the read
|
||||
* @param adjustmentFactor how much to add/subtract to the current count
|
||||
*/
|
||||
public final void adjustReducedCount(final int i, final int adjustmentFactor) {
|
||||
if ( !isReducedRead() )
|
||||
throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");
|
||||
final byte[] reducedCounts = getReducedReadCounts();
|
||||
final byte newCount = (byte)(reducedCounts[i] + adjustmentFactor);
|
||||
setReducedCount(i, newCount);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -25,18 +25,21 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -115,6 +118,51 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
|
|||
return true;
|
||||
}
|
||||
|
||||
@DataProvider(name = "BiasedDownsamplingTest")
|
||||
public Object[][] makeBiasedDownsamplingTest() {
|
||||
final List<Object[]> tests = new LinkedList<Object[]>();
|
||||
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||
|
||||
for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) {
|
||||
for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) {
|
||||
for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) {
|
||||
if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount )
|
||||
continue;
|
||||
for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) {
|
||||
if ( toRemove <= originalNormalCount + originalReducedCount )
|
||||
tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BiasedDownsamplingTest")
|
||||
public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) {
|
||||
|
||||
final LinkedList<PileupElement> elements = new LinkedList<PileupElement>();
|
||||
for ( int i = 0; i < originalNormalCount; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
|
||||
elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
|
||||
}
|
||||
if ( originalReducedCount > 0 ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
|
||||
read.setReducedReadCountsTag(new byte[]{(byte)originalReducedCount});
|
||||
elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
|
||||
}
|
||||
|
||||
final List<PileupElement> result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + originalReducedCount, toRemove);
|
||||
int pileupCount = 0;
|
||||
for ( final PileupElement pe : elements ) // reduced reads may have gotten modified
|
||||
pileupCount += pe.getRepresentativeCount();
|
||||
for ( final PileupElement pe : result )
|
||||
pileupCount -= pe.getRepresentativeCount();
|
||||
|
||||
Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLoadContaminationFileDetails(){
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ public class GATKSAMRecordUnitTest extends BaseTest {
|
|||
final static String BASES = "ACTG";
|
||||
final static String QUALS = "!+5?";
|
||||
final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1};
|
||||
final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets
|
||||
final private static byte[] REDUCED_READ_COUNTS_OFFSETS = new byte[]{10, 10, 20, 30, -9}; // just the offsets
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
|
|
@ -52,7 +52,7 @@ public class GATKSAMRecordUnitTest extends BaseTest {
|
|||
reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length());
|
||||
reducedRead.setReadBases(BASES.getBytes());
|
||||
reducedRead.setBaseQualityString(QUALS);
|
||||
reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG);
|
||||
reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS_OFFSETS);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -66,6 +66,36 @@ public class GATKSAMRecordUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void testGetReducedCountOnNormalRead() {
|
||||
read.getReducedCount(0);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void testSetReducedTagOnNormalRead() {
|
||||
read.setReducedCount(0, (byte)2);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void testSetReducedCountToNegativeNumber() {
|
||||
reducedRead.setReducedCount(0, (byte)1000);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetReducedTagOnReducedRead() {
|
||||
for (int i = 0; i < reducedRead.getReadLength(); i++) {
|
||||
final byte newCount = (byte)i;
|
||||
reducedRead.setReducedCount(i, newCount);
|
||||
Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
|
||||
}
|
||||
|
||||
for (int i = 0; i < reducedRead.getReadLength(); i++) {
|
||||
final int newCount = reducedRead.getReducedCount(i) + i;
|
||||
reducedRead.adjustReducedCount(i, i);
|
||||
Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedReadPileupElement() {
|
||||
PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0);
|
||||
|
|
|
|||
Loading…
Reference in New Issue