From 2f800b078c07c1a6fc1dd5ed97282dc17c177e7e Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 8 Feb 2012 15:27:16 -0500 Subject: [PATCH] Changes to default behavior of UG: multi-allelic mode is always on; max number of alternate alleles to genotype is 3; alleles in the SNP model are ranked by their likelihood sum (Guillermo will do this for indels); SB is computed again. --- ...elGenotypeLikelihoodsCalculationModel.java | 5 +- ...NPGenotypeLikelihoodsCalculationModel.java | 85 ++++++++++--------- .../genotyper/UnifiedArgumentCollection.java | 20 ++--- .../walkers/genotyper/UnifiedGenotyper.java | 2 +- .../genotyper/UnifiedGenotyperEngine.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 27 ++++-- 6 files changed, 72 insertions(+), 69 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 0422fbf03..49c131ce2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -55,9 +55,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - private final boolean doMultiAllelicCalls; + private final boolean doMultiAllelicCalls = true; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; - private final int maxAlternateAlleles; private PairHMMIndelErrorModel pairModel; private static ThreadLocal>> indelLikelihoodMap = @@ -88,8 +87,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES; - doMultiAllelicCalls = UAC.MULTI_ALLELIC; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index ea53c815d..6f1f86c6d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -43,13 +43,24 @@ import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - private boolean ALLOW_MULTIPLE_ALLELES; - private final boolean useAlleleFromVCF; + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[4]; + + private final class LikelihoodSum implements Comparable { + public double sum = 0.0; + public Allele base; + + public LikelihoodSum(Allele base) { this.base = base; } + + public int compareTo(LikelihoodSum other) { + final double diff = sum - other.sum; + return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0; + } + } + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); - ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC; useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; // make sure the PL cache has been initialized with enough alleles @@ -69,7 +80,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC if ( !(priors instanceof DiploidSNPGenotypePriors) ) throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); - final boolean[] basesToUse = new boolean[4]; final byte refBase = ref.getBase(); final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); @@ -95,46 +105,40 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // find the alternate allele(s) that we should be using if ( alternateAlleleToUse != null ) { - basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true; + alleles.add(alternateAlleleToUse); } else if ( useAlleleFromVCF ) { final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); // ignore places where we don't have a SNP if ( vc == null || !vc.isSNP() ) return null; - - for ( Allele allele : vc.getAlternateAlleles() ) - basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true; + + alleles.addAll(vc.getAlternateAlleles()); } else { - determineAlternateAlleles(basesToUse, refBase, GLs); - - // how many alternate alleles are we using? - int alleleCounter = Utils.countSetBits(basesToUse); + alleles.addAll(determineAlternateAlleles(refBase, GLs)); // if there are no non-ref alleles... - if ( alleleCounter == 0 ) { + if ( alleles.size() == 1 ) { // if we only want variants, then we don't need to calculate genotype likelihoods if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) return builder.make(); // otherwise, choose any alternate allele (it doesn't really matter) - basesToUse[indexOfRefBase == 0 ? 1 : 0] = true; + alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0))); } } // create the alternate alleles and the allele ordering (the ordering is crucial for the GLs) - final int numAltAlleles = Utils.countSetBits(basesToUse); - final int[] alleleOrdering = new int[numAltAlleles + 1]; - alleleOrdering[0] = indexOfRefBase; - int alleleOrderingIndex = 1; - int numLikelihoods = 1; - for ( int i = 0; i < 4; i++ ) { - if ( i != indexOfRefBase && basesToUse[i] ) { - alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false)); - alleleOrdering[alleleOrderingIndex++] = i; - numLikelihoods += alleleOrderingIndex; - } + final int numAlleles = alleles.size(); + final int numAltAlleles = numAlleles - 1; + + final int[] alleleOrdering = new int[numAlleles]; + int alleleOrderingIndex = 0; + int numLikelihoods = 0; + for ( Allele allele : alleles ) { + alleleOrdering[alleleOrderingIndex++] = BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0]); + numLikelihoods += alleleOrderingIndex; } builder.alleles(alleles); @@ -165,13 +169,14 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return builder.genotypes(genotypes).make(); } - - // fills in the allelesToUse array - protected void determineAlternateAlleles(final boolean[] allelesToUse, final byte ref, final List sampleDataList) { + + // determines the alleles to use + protected List determineAlternateAlleles(final byte ref, final List sampleDataList) { final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref); final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal(); - final double[] likelihoodCounts = new double[4]; + for ( int i = 0; i < 4; i++ ) + likelihoodSums[i] = new LikelihoodSum(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false)); // based on the GLs, find the alternate alleles with the most probability for ( SampleGenotypeData sampleData : sampleDataList ) { @@ -180,25 +185,21 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC if ( PLindexOfBestGL != PLindexOfRef ) { int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL]; if ( alleles[0] != baseIndexOfRef ) - likelihoodCounts[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; + likelihoodSums[alleles[0]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; // don't double-count it if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] ) - likelihoodCounts[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; + likelihoodSums[alleles[1]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; } } - if ( ALLOW_MULTIPLE_ALLELES ) { - for ( int i = 0; i < 4; i++ ) { - if ( likelihoodCounts[i] > 0.0 ) { - allelesToUse[i] = true; - } - } - } else { - // set the non-ref base which has the maximum sum of non-ref GLs - final int indexOfMax = MathUtils.maxElementIndex(likelihoodCounts); - if ( likelihoodCounts[indexOfMax] > 0.0 ) - allelesToUse[indexOfMax] = true; + Collections.sort(Arrays.asList(likelihoodSums)); + final List allelesToUse = new ArrayList(3); + for ( LikelihoodSum sum : likelihoodSums ) { + if ( sum.sum > 0.0 ) + allelesToUse.add(sum.base); } + + return allelesToUse; } public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 16159393f..82e411c25 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -84,8 +84,8 @@ public class UnifiedArgumentCollection { /** * This argument is not enabled by default because it increases the runtime by an appreciable amount. */ - @Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false) - public boolean COMPUTE_SLOD = false; + @Argument(fullName = "noSLOD", shortName = "nosl", doc = "If provided, we will not calculate the SLOD", required = false) + public boolean NO_SLOD = false; /** * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding @@ -103,21 +103,12 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - /** - * The default behavior of the Unified Genotyper is to allow the genotyping of just one alternate allele in discovery mode; using this flag - * will enable the discovery of multiple alternate alleles. Please note that this works for SNPs only and that it is still highly experimental. - * For advanced users only. - */ - @Advanced - @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles", required = false) - public boolean MULTI_ALLELIC = false; - /** * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), - * then this site will be skipped and a warning printed. Note that genotyping sites with many alternate alleles is both CPU and memory intensive. + * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive. */ @Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) - public int MAX_ALTERNATE_ALLELES = 5; + public int MAX_ALTERNATE_ALLELES = 3; // indel-related arguments /** @@ -168,7 +159,7 @@ public class UnifiedArgumentCollection { uac.PCR_error = PCR_error; uac.GenotypingMode = GenotypingMode; uac.OutputMode = OutputMode; - uac.COMPUTE_SLOD = COMPUTE_SLOD; + uac.NO_SLOD = NO_SLOD; uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; @@ -185,7 +176,6 @@ public class UnifiedArgumentCollection { // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; - uac.MULTI_ALLELIC = MULTI_ALLELIC; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index b3f0954a2..1106fcb52 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -240,7 +240,7 @@ public class UnifiedGenotyper extends LocusWalker e = new HashMap(); - e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" ); - e.put( "--computeSLOD", "6172d2f3d370132f4c57a26aa94c256e" ); + e.put( "--min_base_quality_score 26", "258c1b33349eb3b2d395ec4d69302725" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -125,6 +132,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { } } + @Test + public void testSLOD() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("6172d2f3d370132f4c57a26aa94c256e")); + executeTest("test SLOD", spec); + } + @Test public void testOutputParameter() { HashMap e = new HashMap();