diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java new file mode 100755 index 000000000..e224a9c57 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.pileup.*; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.io.PrintStream; +import java.util.*; + +public class AlleleBiasedDownsamplingUtils { + + private static final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + static { + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i] = new ArrayList(); + } + + /** + * Computes an allele biased version of the given pileup + * + * @param pileup the original pileup + * @param downsamplingFraction the fraction of total reads to remove per allele + * @param log logging output + * @return allele biased pileup + */ + public synchronized static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { + // special case removal of all or no reads + if ( downsamplingFraction <= 0.0 ) + return pileup; + if ( downsamplingFraction >= 1.0 ) + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); + + // start by stratifying the reads by the alleles they represent at this position + for( final PileupElement pe : pileup ) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } + + // Down-sample *each* allele by the contamination fraction applied to the entire pileup. + // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. + int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor + final TreeSet elementsToKeep = new TreeSet(new Comparator() { + @Override + public int compare(PileupElement element1, PileupElement element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + for ( int i = 0; i < 4; i++ ) { + final ArrayList alleleList = alleleStratifiedElements[i]; + if ( alleleList.size() <= numReadsToRemove ) + logAllElements(alleleList, log); + else + elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log)); + } + + // clean up pointers so memory can be garbage collected if needed + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i].clear(); + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); + } + + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to keep + * + * @param elements original list of records + * @param numElementsToRemove the number of records to remove + * @param log logging output + * @return the list of pileup elements TO KEEP + */ + private static List downsampleElements(final ArrayList elements, final int numElementsToRemove, final PrintStream log) { + final int pileupSize = elements.size(); + final BitSet itemsToRemove = new BitSet(pileupSize); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + ArrayList elementsToKeep = new ArrayList(pileupSize - numElementsToRemove); + for ( int i = 0; i < pileupSize; i++ ) { + if ( itemsToRemove.get(i) ) + logRead(elements.get(i).getRead(), log); + else + elementsToKeep.add(elements.get(i)); + } + + return elementsToKeep; + } + + /** + * Computes reads to remove based on an allele biased down-sampling + * + * @param alleleReadMap original list of records per allele + * @param downsamplingFraction the fraction of total reads to remove per allele + * @param log logging output + * @return list of reads TO REMOVE from allele biased down-sampling + */ + public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction, final PrintStream log) { + int totalReads = 0; + for ( final List reads : alleleReadMap.values() ) + totalReads += reads.size(); + + // Down-sample *each* allele by the contamination fraction applied to the entire pileup. + int numReadsToRemove = (int)(totalReads * downsamplingFraction); + final List readsToRemove = new ArrayList(numReadsToRemove * alleleReadMap.size()); + for ( final List reads : alleleReadMap.values() ) { + if ( reads.size() <= numReadsToRemove ) { + readsToRemove.addAll(reads); + logAllReads(reads, log); + } else { + readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log)); + } + } + + return readsToRemove; + } + + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove + * + * @param reads original list of records + * @param numElementsToRemove the number of records to remove + * @param log logging output + * @return the list of pileup elements TO REMOVE + */ + private static List downsampleReads(final List reads, final int numElementsToRemove, final PrintStream log) { + final int pileupSize = reads.size(); + final BitSet itemsToRemove = new BitSet(pileupSize); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + ArrayList readsToRemove = new ArrayList(pileupSize - numElementsToRemove); + for ( int i = 0; i < pileupSize; i++ ) { + if ( itemsToRemove.get(i) ) { + final GATKSAMRecord read = reads.get(i); + readsToRemove.add(read); + logRead(read, log); + } + } + + return readsToRemove; + } + + private static void logAllElements(final List elements, final PrintStream log) { + if ( log != null ) { + for ( final PileupElement p : elements ) + logRead(p.getRead(), log); + } + } + + private static void logAllReads(final List reads, final PrintStream log) { + if ( log != null ) { + for ( final GATKSAMRecord read : reads ) + logRead(read, log); + } + } + + private static void logRead(final SAMRecord read, final PrintStream log) { + if ( log != null ) { + final SAMReadGroupRecord readGroup = read.getReadGroup(); + log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit())); + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index a01973ad5..fc6d23382 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -59,7 +60,7 @@ public class ErrorModel { boolean hasCalledAlleles = false; - final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); if (refSampleVC != null) { for (Allele allele : refSampleVC.getAlleles()) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 3293ac817..5aba23faa 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -51,6 +51,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -424,7 +425,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } - final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult ); + final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a05e20bd3..e1a9ef7c8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.PrintStream; import java.util.*; public class LikelihoodCalculationEngine { @@ -346,7 +347,9 @@ public class LikelihoodCalculationEngine { public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, - final Pair>> call) { + final Pair>> call, + final double downsamplingFraction, + final PrintStream downsamplingLog ) { final Map returnMap = new HashMap(); final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst()); for( final Map.Entry> sample : perSampleReadList.entrySet() ) { @@ -370,6 +373,9 @@ public class LikelihoodCalculationEngine { } } + // down-sample before adding filtered reads + likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all diff --git a/protected/java/src/org/broadinstitute/sting/utils/genotyper/AdvancedPerReadAlleleLikelihoodMap.java b/protected/java/src/org/broadinstitute/sting/utils/genotyper/AdvancedPerReadAlleleLikelihoodMap.java index db1bf8762..80618cfdb 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/genotyper/AdvancedPerReadAlleleLikelihoodMap.java +++ b/protected/java/src/org/broadinstitute/sting/utils/genotyper/AdvancedPerReadAlleleLikelihoodMap.java @@ -25,16 +25,43 @@ package org.broadinstitute.sting.utils.genotyper; +import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; -import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; +import java.io.PrintStream; import java.util.*; public class AdvancedPerReadAlleleLikelihoodMap extends StandardPerReadAlleleLikelihoodMap implements ProtectedPackageSource { - public PerReadAlleleLikelihoodMap createPerAlleleDownsampledMap(final double downsamplingFraction) { - return this; + public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { + return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log); } - } + + public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) { + // special case removal of all or no reads + if ( downsamplingFraction <= 0.0 ) + return; + if ( downsamplingFraction >= 1.0 ) { + likelihoodReadMap.clear(); + return; + } + + // start by stratifying the reads by the alleles they represent at this position + final Map> alleleReadMap = new HashMap>(alleles.size()); + for ( Allele allele : alleles ) + alleleReadMap.put(allele, new ArrayList()); + + for ( Map.Entry> entry : likelihoodReadMap.entrySet() ) { + final Allele bestAllele = getMostLikelyAllele(entry.getValue()); + alleleReadMap.get(bestAllele).add(entry.getKey()); + } + + // compute the reads to remove and actually remove them + final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log); + for ( final GATKSAMRecord read : readsToRemove ) + likelihoodReadMap.remove(read); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index aa3a1e6ac..36456ca09 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; +import java.io.PrintStream; /** * Created with IntelliJ IDEA. @@ -63,26 +64,23 @@ public class StandardCallerArgumentCollection { public int MAX_ALTERNATE_ALLELES = 6; /** - * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), - * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it - * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend - * that you not play around with this parameter. - * - * This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on + * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. */ - @Deprecated - @Hidden - @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on, which will apply to any variant, regardless of type", required = false) - public int MAX_ALTERNATE_ALLELES_FOR_INDELS = -1; + @Advanced + @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) + public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we * will try to remove (N * contamination fraction) bases for each alternate allele. */ - @Hidden @Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) - public double CONTAMINATION_PERCENTAGE = 0.0; + public double CONTAMINATION_FRACTION = 0.0; + + @Hidden + @Argument(shortName = "logRemovedReadsFromContaminationFiltering", doc="contaminationLog", required=false) + public PrintStream contaminationLog = null; @Hidden @Argument(shortName = "logExactCalls", doc="x", required=false) @@ -96,19 +94,12 @@ public class StandardCallerArgumentCollection { this.GenotypingMode = SCAC.GenotypingMode; this.heterozygosity = SCAC.heterozygosity; this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; - this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; this.OutputMode = SCAC.OutputMode; this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; - this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE; + this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION; + this.contaminationLog = SCAC.contaminationLog; this.exactCallsLog = SCAC.exactCallsLog; this.AFmodel = SCAC.AFmodel; } - - /** - * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. - */ - @Advanced - @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index e1092454a..0d9f443e2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.*; @@ -81,7 +82,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final List allAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser, - final Map perReadAlleleLikelihoodMap) { + final Map perReadAlleleLikelihoodMap) { GenomeLoc loc = ref.getLocus(); // if (!ref.getLocus().equals(lastSiteVisited)) { @@ -118,12 +119,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){ // no likelihoods have been computed for this sample at this site - perReadAlleleLikelihoodMap.put(sample.getKey(), org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap()); + perReadAlleleLikelihoodMap.put(sample.getKey(), PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap()); } final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) { final GenotypeBuilder b = new GenotypeBuilder(sample.getKey()); - final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey())); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.CONTAMINATION_FRACTION, UAC.contaminationLog); b.PL(genotypeLikelihoods); b.DP(getFilteredDepth(pileup)); genotypes.add(b.make()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 54303e12c..791cdc325 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -48,13 +49,13 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC private final boolean useAlleleFromVCF; private final double[] likelihoodSums = new double[4]; - private final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + + private final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap; protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i] = new ArrayList(); + perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); } public VariantContext getLikelihoods(final RefMetaDataTracker tracker, @@ -64,9 +65,9 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final List allAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser, - final Map perReadAlleleLikelihoodMap) { + final Map sampleLikelihoodMap) { - perReadAlleleLikelihoodMap.clear(); // not used in SNP model, sanity check to delete any older data + sampleLikelihoodMap.clear(); // not used in SNP model, sanity check to delete any older data final byte refBase = ref.getBase(); final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); @@ -79,8 +80,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ArrayList GLs = new ArrayList(contexts.size()); for ( Map.Entry sample : contexts.entrySet() ) { ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); - if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 ) - pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE); + if ( UAC.CONTAMINATION_FRACTION > 0.0 ) + pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, UAC.CONTAMINATION_FRACTION, UAC.contaminationLog); if ( useBAQedPileup ) pileup = createBAQedPileup(pileup); @@ -203,42 +204,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return allelesToUse; } - public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) { - // special case removal of all reads - if ( contaminationPercentage >= 1.0 ) - return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); - - // start by stratifying the reads by the alleles they represent at this position - for( final PileupElement pe : pileup ) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); - } - - // Down-sample *each* allele by the contamination fraction applied to the entire pileup. - // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. - int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage); - final TreeSet elementsToKeep = new TreeSet(new Comparator() { - @Override - public int compare(PileupElement element1, PileupElement element2) { - final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); - return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); - } - }); - - for ( int i = 0; i < 4; i++ ) { - final ArrayList alleleList = alleleStratifiedElements[i]; - if ( alleleList.size() > numReadsToRemove ) - elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove)); - } - - // clean up pointers so memory can be garbage collected if needed - for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i].clear(); - - return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); - } - public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { final List BAQedElements = new ArrayList(); for( final PileupElement PE : pileup ) { @@ -257,22 +222,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } } - private List downsampleElements(final ArrayList elements, final int numElementsToRemove) { - final int pileupSize = elements.size(); - final BitSet itemsToRemove = new BitSet(pileupSize); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { - itemsToRemove.set(selectedIndex); - } - - ArrayList elementsToKeep = new ArrayList(pileupSize - numElementsToRemove); - for ( int i = 0; i < pileupSize; i++ ) { - if ( !itemsToRemove.get(i) ) - elementsToKeep.add(elements.get(i)); - } - - return elementsToKeep; - } - private static class SampleGenotypeData { public final String name; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 4d9f37098..3c4a97ec1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -153,19 +153,19 @@ public class UnifiedGenotyperEngine { } - /** - * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. - * - * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype - * for every sample in allSamples. If it's null there's no such guarentee. Providing this - * argument is critical when the resulting calls will be written to a VCF file. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) - * @return the VariantCallContext object - */ + /** + * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. + * + * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype + * for every sample in allSamples. If it's null there's no such guarentee. Providing this + * argument is critical when the resulting calls will be written to a VCF file. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) + * @return the VariantCallContext object + */ public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, final AlignmentContext rawContext, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 04a26672c..79962a3e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -41,6 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; +import java.io.PrintStream; import java.util.Arrays; import java.util.LinkedHashMap; import java.util.Map; @@ -187,11 +188,14 @@ public class PairHMMIndelErrorModel { final LinkedHashMap haplotypeMap, final ReferenceContext ref, final int eventLength, - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){ + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, + final double downsamplingFraction, + final PrintStream downsamplingLog) { final int numHaplotypes = haplotypeMap.size(); final int readCounts[] = new int[pileup.getNumberOfElements()]; final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts); + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); } diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index bae38283d..22d249240 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -28,9 +28,11 @@ package org.broadinstitute.sting.utils.genotyper; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; +import java.io.PrintStream; import java.lang.reflect.Constructor; import java.util.*; @@ -41,6 +43,9 @@ public abstract class PerReadAlleleLikelihoodMap { protected List alleles; protected Map> likelihoodReadMap; + public abstract void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log); + public abstract ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log); + public void add(GATKSAMRecord read, Allele a, Double likelihood) { Map likelihoodMap; if (likelihoodReadMap.containsKey(read)){ @@ -118,8 +123,6 @@ public abstract class PerReadAlleleLikelihoodMap { return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL ); } - public abstract PerReadAlleleLikelihoodMap createPerAlleleDownsampledMap(final double downsamplingFraction); - public static PerReadAlleleLikelihoodMap getBestAvailablePerReadAlleleLikelihoodMap() { final Class PerReadAlleleLikelihoodMapClass = GATKLiteUtils.getProtectedClassIfAvailable(PerReadAlleleLikelihoodMap.class); try { diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/StandardPerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/StandardPerReadAlleleLikelihoodMap.java index 7ac757783..7db818592 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/StandardPerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/StandardPerReadAlleleLikelihoodMap.java @@ -26,10 +26,11 @@ package org.broadinstitute.sting.utils.genotyper; import org.broadinstitute.sting.utils.classloader.PublicPackageSource; -import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; +import java.io.PrintStream; import java.util.*; public class StandardPerReadAlleleLikelihoodMap extends PerReadAlleleLikelihoodMap implements PublicPackageSource { @@ -40,5 +41,6 @@ public class StandardPerReadAlleleLikelihoodMap extends PerReadAlleleLikelihoodM } // not implemented in the standard version - public PerReadAlleleLikelihoodMap createPerAlleleDownsampledMap(final double downsamplingFraction) { return this; } - } + public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {} + public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { return pileup; } +}