Added allele-biased down-sampling capabilities to the PerReadAlleleLikelihoodMap object, which means that both the UG and HC can use this functionality. Note that it's only available in the protected codebase, so GATK-lite users won't be able to enable it. Needs more testing.
This commit is contained in:
parent
9da7bbf689
commit
c6b57fffda
|
|
@ -0,0 +1,195 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class AlleleBiasedDownsamplingUtils {
|
||||
|
||||
private static final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
|
||||
static {
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes an allele biased version of the given pileup
|
||||
*
|
||||
* @param pileup the original pileup
|
||||
* @param downsamplingFraction the fraction of total reads to remove per allele
|
||||
* @param log logging output
|
||||
* @return allele biased pileup
|
||||
*/
|
||||
public synchronized static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
|
||||
// special case removal of all or no reads
|
||||
if ( downsamplingFraction <= 0.0 )
|
||||
return pileup;
|
||||
if ( downsamplingFraction >= 1.0 )
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
|
||||
|
||||
// start by stratifying the reads by the alleles they represent at this position
|
||||
for( final PileupElement pe : pileup ) {
|
||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||
if ( baseIndex != -1 )
|
||||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
|
||||
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
|
||||
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
|
||||
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
|
||||
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
|
||||
@Override
|
||||
public int compare(PileupElement element1, PileupElement element2) {
|
||||
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
|
||||
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
|
||||
}
|
||||
});
|
||||
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
||||
if ( alleleList.size() <= numReadsToRemove )
|
||||
logAllElements(alleleList, log);
|
||||
else
|
||||
elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log));
|
||||
}
|
||||
|
||||
// clean up pointers so memory can be garbage collected if needed
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i].clear();
|
||||
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to keep
|
||||
*
|
||||
* @param elements original list of records
|
||||
* @param numElementsToRemove the number of records to remove
|
||||
* @param log logging output
|
||||
* @return the list of pileup elements TO KEEP
|
||||
*/
|
||||
private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
|
||||
final int pileupSize = elements.size();
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( itemsToRemove.get(i) )
|
||||
logRead(elements.get(i).getRead(), log);
|
||||
else
|
||||
elementsToKeep.add(elements.get(i));
|
||||
}
|
||||
|
||||
return elementsToKeep;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes reads to remove based on an allele biased down-sampling
|
||||
*
|
||||
* @param alleleReadMap original list of records per allele
|
||||
* @param downsamplingFraction the fraction of total reads to remove per allele
|
||||
* @param log logging output
|
||||
* @return list of reads TO REMOVE from allele biased down-sampling
|
||||
*/
|
||||
public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction, final PrintStream log) {
|
||||
int totalReads = 0;
|
||||
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
|
||||
totalReads += reads.size();
|
||||
|
||||
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
|
||||
int numReadsToRemove = (int)(totalReads * downsamplingFraction);
|
||||
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove * alleleReadMap.size());
|
||||
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() ) {
|
||||
if ( reads.size() <= numReadsToRemove ) {
|
||||
readsToRemove.addAll(reads);
|
||||
logAllReads(reads, log);
|
||||
} else {
|
||||
readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log));
|
||||
}
|
||||
}
|
||||
|
||||
return readsToRemove;
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
|
||||
*
|
||||
* @param reads original list of records
|
||||
* @param numElementsToRemove the number of records to remove
|
||||
* @param log logging output
|
||||
* @return the list of pileup elements TO REMOVE
|
||||
*/
|
||||
private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
|
||||
final int pileupSize = reads.size();
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(pileupSize - numElementsToRemove);
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( itemsToRemove.get(i) ) {
|
||||
final GATKSAMRecord read = reads.get(i);
|
||||
readsToRemove.add(read);
|
||||
logRead(read, log);
|
||||
}
|
||||
}
|
||||
|
||||
return readsToRemove;
|
||||
}
|
||||
|
||||
private static void logAllElements(final List<PileupElement> elements, final PrintStream log) {
|
||||
if ( log != null ) {
|
||||
for ( final PileupElement p : elements )
|
||||
logRead(p.getRead(), log);
|
||||
}
|
||||
}
|
||||
|
||||
private static void logAllReads(final List<GATKSAMRecord> reads, final PrintStream log) {
|
||||
if ( log != null ) {
|
||||
for ( final GATKSAMRecord read : reads )
|
||||
logRead(read, log);
|
||||
}
|
||||
}
|
||||
|
||||
private static void logRead(final SAMRecord read, final PrintStream log) {
|
||||
if ( log != null ) {
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
|
@ -59,7 +60,7 @@ public class ErrorModel {
|
|||
|
||||
boolean hasCalledAlleles = false;
|
||||
|
||||
final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
if (refSampleVC != null) {
|
||||
|
||||
for (Allele allele : refSampleVC.getAlleles()) {
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
|
@ -424,7 +425,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
|
||||
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
|
||||
|
||||
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
|
||||
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
|
||||
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
|
||||
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
|
|||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class LikelihoodCalculationEngine {
|
||||
|
|
@ -346,7 +347,9 @@ public class LikelihoodCalculationEngine {
|
|||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
|
||||
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog ) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
|
|
@ -370,6 +373,9 @@ public class LikelihoodCalculationEngine {
|
|||
}
|
||||
}
|
||||
|
||||
// down-sample before adding filtered reads
|
||||
likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
|
||||
|
||||
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
|
||||
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
|
|
|
|||
|
|
@ -25,16 +25,43 @@
|
|||
package org.broadinstitute.sting.utils.genotyper;
|
||||
|
||||
|
||||
import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class AdvancedPerReadAlleleLikelihoodMap extends StandardPerReadAlleleLikelihoodMap implements ProtectedPackageSource {
|
||||
|
||||
public PerReadAlleleLikelihoodMap createPerAlleleDownsampledMap(final double downsamplingFraction) {
|
||||
return this;
|
||||
public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
|
||||
return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log);
|
||||
}
|
||||
}
|
||||
|
||||
public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {
|
||||
// special case removal of all or no reads
|
||||
if ( downsamplingFraction <= 0.0 )
|
||||
return;
|
||||
if ( downsamplingFraction >= 1.0 ) {
|
||||
likelihoodReadMap.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
// start by stratifying the reads by the alleles they represent at this position
|
||||
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>(alleles.size());
|
||||
for ( Allele allele : alleles )
|
||||
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
|
||||
|
||||
for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
|
||||
final Allele bestAllele = getMostLikelyAllele(entry.getValue());
|
||||
alleleReadMap.get(bestAllele).add(entry.getKey());
|
||||
}
|
||||
|
||||
// compute the reads to remove and actually remove them
|
||||
final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log);
|
||||
for ( final GATKSAMRecord read : readsToRemove )
|
||||
likelihoodReadMap.remove(read);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
|
|||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
|
|
@ -63,26 +64,23 @@ public class StandardCallerArgumentCollection {
|
|||
public int MAX_ALTERNATE_ALLELES = 6;
|
||||
|
||||
/**
|
||||
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
|
||||
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
|
||||
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
|
||||
* that you not play around with this parameter.
|
||||
*
|
||||
* This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on
|
||||
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
|
||||
*/
|
||||
@Deprecated
|
||||
@Hidden
|
||||
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on, which will apply to any variant, regardless of type", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = -1;
|
||||
@Advanced
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
|
||||
|
||||
/**
|
||||
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
|
||||
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
|
||||
* will try to remove (N * contamination fraction) bases for each alternate allele.
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
|
||||
public double CONTAMINATION_PERCENTAGE = 0.0;
|
||||
public double CONTAMINATION_FRACTION = 0.0;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logRemovedReadsFromContaminationFiltering", doc="contaminationLog", required=false)
|
||||
public PrintStream contaminationLog = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logExactCalls", doc="x", required=false)
|
||||
|
|
@ -96,19 +94,12 @@ public class StandardCallerArgumentCollection {
|
|||
this.GenotypingMode = SCAC.GenotypingMode;
|
||||
this.heterozygosity = SCAC.heterozygosity;
|
||||
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
|
||||
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE;
|
||||
this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
|
||||
this.contaminationLog = SCAC.contaminationLog;
|
||||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
this.AFmodel = SCAC.AFmodel;
|
||||
}
|
||||
|
||||
/**
|
||||
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.BaseUtils;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
|
@ -81,7 +82,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
final List<Allele> allAllelesToUse,
|
||||
final boolean useBAQedPileup,
|
||||
final GenomeLocParser locParser,
|
||||
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
|
||||
|
||||
GenomeLoc loc = ref.getLocus();
|
||||
// if (!ref.getLocus().equals(lastSiteVisited)) {
|
||||
|
|
@ -118,12 +119,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){
|
||||
// no likelihoods have been computed for this sample at this site
|
||||
perReadAlleleLikelihoodMap.put(sample.getKey(), org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap());
|
||||
perReadAlleleLikelihoodMap.put(sample.getKey(), PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap());
|
||||
}
|
||||
final ReadBackedPileup pileup = context.getBasePileup();
|
||||
if (pileup != null) {
|
||||
final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
|
||||
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()));
|
||||
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.CONTAMINATION_FRACTION, UAC.contaminationLog);
|
||||
b.PL(genotypeLikelihoods);
|
||||
b.DP(getFilteredDepth(pileup));
|
||||
genotypes.add(b.make());
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
|||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
|
@ -48,13 +49,13 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
private final boolean useAlleleFromVCF;
|
||||
|
||||
private final double[] likelihoodSums = new double[4];
|
||||
private final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
|
||||
|
||||
private final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap;
|
||||
|
||||
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
|
||||
perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
}
|
||||
|
||||
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
|
||||
|
|
@ -64,9 +65,9 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
final List<Allele> allAllelesToUse,
|
||||
final boolean useBAQedPileup,
|
||||
final GenomeLocParser locParser,
|
||||
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> sampleLikelihoodMap) {
|
||||
|
||||
perReadAlleleLikelihoodMap.clear(); // not used in SNP model, sanity check to delete any older data
|
||||
sampleLikelihoodMap.clear(); // not used in SNP model, sanity check to delete any older data
|
||||
|
||||
final byte refBase = ref.getBase();
|
||||
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||
|
|
@ -79,8 +80,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
|
||||
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
|
||||
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
|
||||
if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 )
|
||||
pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE);
|
||||
if ( UAC.CONTAMINATION_FRACTION > 0.0 )
|
||||
pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, UAC.CONTAMINATION_FRACTION, UAC.contaminationLog);
|
||||
if ( useBAQedPileup )
|
||||
pileup = createBAQedPileup(pileup);
|
||||
|
||||
|
|
@ -203,42 +204,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
return allelesToUse;
|
||||
}
|
||||
|
||||
public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) {
|
||||
// special case removal of all reads
|
||||
if ( contaminationPercentage >= 1.0 )
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
|
||||
|
||||
// start by stratifying the reads by the alleles they represent at this position
|
||||
for( final PileupElement pe : pileup ) {
|
||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||
if ( baseIndex != -1 )
|
||||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
|
||||
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
|
||||
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
|
||||
int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage);
|
||||
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
|
||||
@Override
|
||||
public int compare(PileupElement element1, PileupElement element2) {
|
||||
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
|
||||
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
|
||||
}
|
||||
});
|
||||
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
||||
if ( alleleList.size() > numReadsToRemove )
|
||||
elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove));
|
||||
}
|
||||
|
||||
// clean up pointers so memory can be garbage collected if needed
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i].clear();
|
||||
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
|
||||
}
|
||||
|
||||
public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
|
||||
final List<PileupElement> BAQedElements = new ArrayList<PileupElement>();
|
||||
for( final PileupElement PE : pileup ) {
|
||||
|
|
@ -257,22 +222,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
|
||||
}
|
||||
|
||||
private List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove) {
|
||||
final int pileupSize = elements.size();
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( !itemsToRemove.get(i) )
|
||||
elementsToKeep.add(elements.get(i));
|
||||
}
|
||||
|
||||
return elementsToKeep;
|
||||
}
|
||||
|
||||
private static class SampleGenotypeData {
|
||||
|
||||
public final String name;
|
||||
|
|
|
|||
|
|
@ -153,19 +153,19 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
|
||||
*
|
||||
* If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype
|
||||
* for every sample in allSamples. If it's null there's no such guarentee. Providing this
|
||||
* argument is critical when the resulting calls will be written to a VCF file.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @param allSamples set of all sample names that we might call (i.e., those in the VCF header)
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
/**
|
||||
* Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
|
||||
*
|
||||
* If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype
|
||||
* for every sample in allSamples. If it's null there's no such guarentee. Providing this
|
||||
* argument is critical when the resulting calls will be written to a VCF file.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @param allSamples set of all sample names that we might call (i.e., those in the VCF header)
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext refContext,
|
||||
final AlignmentContext rawContext,
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
|
@ -187,11 +188,14 @@ public class PairHMMIndelErrorModel {
|
|||
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
|
||||
final ReferenceContext ref,
|
||||
final int eventLength,
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog) {
|
||||
final int numHaplotypes = haplotypeMap.size();
|
||||
|
||||
final int readCounts[] = new int[pileup.getNumberOfElements()];
|
||||
final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts);
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
|
||||
return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,9 +28,11 @@ package org.broadinstitute.sting.utils.genotyper;
|
|||
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -41,6 +43,9 @@ public abstract class PerReadAlleleLikelihoodMap {
|
|||
protected List<Allele> alleles;
|
||||
protected Map<GATKSAMRecord,Map<Allele,Double>> likelihoodReadMap;
|
||||
|
||||
public abstract void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log);
|
||||
public abstract ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log);
|
||||
|
||||
public void add(GATKSAMRecord read, Allele a, Double likelihood) {
|
||||
Map<Allele,Double> likelihoodMap;
|
||||
if (likelihoodReadMap.containsKey(read)){
|
||||
|
|
@ -118,8 +123,6 @@ public abstract class PerReadAlleleLikelihoodMap {
|
|||
return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL );
|
||||
}
|
||||
|
||||
public abstract PerReadAlleleLikelihoodMap createPerAlleleDownsampledMap(final double downsamplingFraction);
|
||||
|
||||
public static PerReadAlleleLikelihoodMap getBestAvailablePerReadAlleleLikelihoodMap() {
|
||||
final Class PerReadAlleleLikelihoodMapClass = GATKLiteUtils.getProtectedClassIfAvailable(PerReadAlleleLikelihoodMap.class);
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -26,10 +26,11 @@ package org.broadinstitute.sting.utils.genotyper;
|
|||
|
||||
|
||||
import org.broadinstitute.sting.utils.classloader.PublicPackageSource;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class StandardPerReadAlleleLikelihoodMap extends PerReadAlleleLikelihoodMap implements PublicPackageSource {
|
||||
|
|
@ -40,5 +41,6 @@ public class StandardPerReadAlleleLikelihoodMap extends PerReadAlleleLikelihoodM
|
|||
}
|
||||
|
||||
// not implemented in the standard version
|
||||
public PerReadAlleleLikelihoodMap createPerAlleleDownsampledMap(final double downsamplingFraction) { return this; }
|
||||
}
|
||||
public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {}
|
||||
public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { return pileup; }
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue