Changes to default behavior of UG: multi-allelic mode is always on; max number of alternate alleles to genotype is 3; alleles in the SNP model are ranked by their likelihood sum (Guillermo will do this for indels); SB is computed again.

This commit is contained in:
Eric Banks 2012-02-08 15:27:16 -05:00
parent 717cd4b912
commit 2f800b078c
6 changed files with 72 additions and 69 deletions

View File

@ -55,9 +55,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private final boolean getAlleleListFromVCF; private final boolean getAlleleListFromVCF;
private boolean DEBUG = false; private boolean DEBUG = false;
private final boolean doMultiAllelicCalls; private final boolean doMultiAllelicCalls = true;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private final int maxAlternateAlleles;
private PairHMMIndelErrorModel pairModel; private PairHMMIndelErrorModel pairModel;
private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap = private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap =
@ -88,8 +87,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES;
doMultiAllelicCalls = UAC.MULTI_ALLELIC;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>(); haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;

View File

@ -43,13 +43,24 @@ import java.util.*;
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private boolean ALLOW_MULTIPLE_ALLELES;
private final boolean useAlleleFromVCF; private final boolean useAlleleFromVCF;
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[4];
private final class LikelihoodSum implements Comparable<LikelihoodSum> {
public double sum = 0.0;
public Allele base;
public LikelihoodSum(Allele base) { this.base = base; }
public int compareTo(LikelihoodSum other) {
final double diff = sum - other.sum;
return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0;
}
}
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger); super(UAC, logger);
ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC;
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
// make sure the PL cache has been initialized with enough alleles // make sure the PL cache has been initialized with enough alleles
@ -69,7 +80,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
if ( !(priors instanceof DiploidSNPGenotypePriors) ) if ( !(priors instanceof DiploidSNPGenotypePriors) )
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
final boolean[] basesToUse = new boolean[4];
final byte refBase = ref.getBase(); final byte refBase = ref.getBase();
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
@ -95,46 +105,40 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
// find the alternate allele(s) that we should be using // find the alternate allele(s) that we should be using
if ( alternateAlleleToUse != null ) { if ( alternateAlleleToUse != null ) {
basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true; alleles.add(alternateAlleleToUse);
} else if ( useAlleleFromVCF ) { } else if ( useAlleleFromVCF ) {
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
// ignore places where we don't have a SNP // ignore places where we don't have a SNP
if ( vc == null || !vc.isSNP() ) if ( vc == null || !vc.isSNP() )
return null; return null;
for ( Allele allele : vc.getAlternateAlleles() ) alleles.addAll(vc.getAlternateAlleles());
basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true;
} else { } else {
determineAlternateAlleles(basesToUse, refBase, GLs); alleles.addAll(determineAlternateAlleles(refBase, GLs));
// how many alternate alleles are we using?
int alleleCounter = Utils.countSetBits(basesToUse);
// if there are no non-ref alleles... // if there are no non-ref alleles...
if ( alleleCounter == 0 ) { if ( alleles.size() == 1 ) {
// if we only want variants, then we don't need to calculate genotype likelihoods // if we only want variants, then we don't need to calculate genotype likelihoods
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
return builder.make(); return builder.make();
// otherwise, choose any alternate allele (it doesn't really matter) // otherwise, choose any alternate allele (it doesn't really matter)
basesToUse[indexOfRefBase == 0 ? 1 : 0] = true; alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
} }
} }
// create the alternate alleles and the allele ordering (the ordering is crucial for the GLs) // create the alternate alleles and the allele ordering (the ordering is crucial for the GLs)
final int numAltAlleles = Utils.countSetBits(basesToUse); final int numAlleles = alleles.size();
final int[] alleleOrdering = new int[numAltAlleles + 1]; final int numAltAlleles = numAlleles - 1;
alleleOrdering[0] = indexOfRefBase;
int alleleOrderingIndex = 1; final int[] alleleOrdering = new int[numAlleles];
int numLikelihoods = 1; int alleleOrderingIndex = 0;
for ( int i = 0; i < 4; i++ ) { int numLikelihoods = 0;
if ( i != indexOfRefBase && basesToUse[i] ) { for ( Allele allele : alleles ) {
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false)); alleleOrdering[alleleOrderingIndex++] = BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0]);
alleleOrdering[alleleOrderingIndex++] = i; numLikelihoods += alleleOrderingIndex;
numLikelihoods += alleleOrderingIndex;
}
} }
builder.alleles(alleles); builder.alleles(alleles);
@ -165,13 +169,14 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
return builder.genotypes(genotypes).make(); return builder.genotypes(genotypes).make();
} }
// fills in the allelesToUse array // determines the alleles to use
protected void determineAlternateAlleles(final boolean[] allelesToUse, final byte ref, final List<SampleGenotypeData> sampleDataList) { protected List<Allele> determineAlternateAlleles(final byte ref, final List<SampleGenotypeData> sampleDataList) {
final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref); final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref);
final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal(); final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal();
final double[] likelihoodCounts = new double[4]; for ( int i = 0; i < 4; i++ )
likelihoodSums[i] = new LikelihoodSum(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false));
// based on the GLs, find the alternate alleles with the most probability // based on the GLs, find the alternate alleles with the most probability
for ( SampleGenotypeData sampleData : sampleDataList ) { for ( SampleGenotypeData sampleData : sampleDataList ) {
@ -180,25 +185,21 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
if ( PLindexOfBestGL != PLindexOfRef ) { if ( PLindexOfBestGL != PLindexOfRef ) {
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL]; int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL];
if ( alleles[0] != baseIndexOfRef ) if ( alleles[0] != baseIndexOfRef )
likelihoodCounts[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; likelihoodSums[alleles[0]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
// don't double-count it // don't double-count it
if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] ) if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] )
likelihoodCounts[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; likelihoodSums[alleles[1]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
} }
} }
if ( ALLOW_MULTIPLE_ALLELES ) { Collections.sort(Arrays.asList(likelihoodSums));
for ( int i = 0; i < 4; i++ ) { final List<Allele> allelesToUse = new ArrayList<Allele>(3);
if ( likelihoodCounts[i] > 0.0 ) { for ( LikelihoodSum sum : likelihoodSums ) {
allelesToUse[i] = true; if ( sum.sum > 0.0 )
} allelesToUse.add(sum.base);
}
} else {
// set the non-ref base which has the maximum sum of non-ref GLs
final int indexOfMax = MathUtils.maxElementIndex(likelihoodCounts);
if ( likelihoodCounts[indexOfMax] > 0.0 )
allelesToUse[indexOfMax] = true;
} }
return allelesToUse;
} }
public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {

View File

@ -84,8 +84,8 @@ public class UnifiedArgumentCollection {
/** /**
* This argument is not enabled by default because it increases the runtime by an appreciable amount. * This argument is not enabled by default because it increases the runtime by an appreciable amount.
*/ */
@Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false) @Argument(fullName = "noSLOD", shortName = "nosl", doc = "If provided, we will not calculate the SLOD", required = false)
public boolean COMPUTE_SLOD = false; public boolean NO_SLOD = false;
/** /**
* When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
@ -103,21 +103,12 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05; public Double MAX_DELETION_FRACTION = 0.05;
/**
* The default behavior of the Unified Genotyper is to allow the genotyping of just one alternate allele in discovery mode; using this flag
* will enable the discovery of multiple alternate alleles. Please note that this works for SNPs only and that it is still highly experimental.
* For advanced users only.
*/
@Advanced
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles", required = false)
public boolean MULTI_ALLELIC = false;
/** /**
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
* then this site will be skipped and a warning printed. Note that genotyping sites with many alternate alleles is both CPU and memory intensive. * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive.
*/ */
@Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) @Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 5; public int MAX_ALTERNATE_ALLELES = 3;
// indel-related arguments // indel-related arguments
/** /**
@ -168,7 +159,7 @@ public class UnifiedArgumentCollection {
uac.PCR_error = PCR_error; uac.PCR_error = PCR_error;
uac.GenotypingMode = GenotypingMode; uac.GenotypingMode = GenotypingMode;
uac.OutputMode = OutputMode; uac.OutputMode = OutputMode;
uac.COMPUTE_SLOD = COMPUTE_SLOD; uac.NO_SLOD = NO_SLOD;
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
@ -185,7 +176,6 @@ public class UnifiedArgumentCollection {
// todo- arguments to remove // todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION;
uac.MULTI_ALLELIC = MULTI_ALLELIC;
return uac; return uac;
} }

View File

@ -240,7 +240,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
// annotation (INFO) fields from UnifiedGenotyper // annotation (INFO) fields from UnifiedGenotyper
if ( UAC.COMPUTE_SLOD ) if ( !UAC.NO_SLOD )
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));

View File

@ -407,7 +407,7 @@ public class UnifiedGenotyperEngine {
if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) if ( !limitedContext && rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { if ( !UAC.NO_SLOD && !limitedContext && !bestGuessIsRef ) {
//final boolean DEBUG_SLOD = false; //final boolean DEBUG_SLOD = false;
// the overall lod // the overall lod

View File

@ -15,9 +15,9 @@ import java.util.Map;
public class UnifiedGenotyperIntegrationTest extends WalkerTest { public class UnifiedGenotyperIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129;
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() { public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("653172b43b19003d9f7df6dab21f4b09")); Arrays.asList("9ab4e98ce437a1c5e1eee338de49ee7e"));
executeTest("test MultiSample Pilot1", spec); executeTest("test MultiSample Pilot1", spec);
} }
@ -56,6 +56,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
executeTest("test SingleSample Pilot2", spec); executeTest("test SingleSample Pilot2", spec);
} }
@Test
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
Arrays.asList("aabc4b3a312aba18b78e14750d8c8e62"));
executeTest("test Multiple SNP alleles", spec);
}
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
// testing compressed output // testing compressed output
@ -114,8 +122,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test @Test
public void testCallingParameters() { public void testCallingParameters() {
HashMap<String, String> e = new HashMap<String, String>(); HashMap<String, String> e = new HashMap<String, String>();
e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" ); e.put( "--min_base_quality_score 26", "258c1b33349eb3b2d395ec4d69302725" );
e.put( "--computeSLOD", "6172d2f3d370132f4c57a26aa94c256e" );
for ( Map.Entry<String, String> entry : e.entrySet() ) { for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -125,6 +132,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
} }
} }
@Test
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("6172d2f3d370132f4c57a26aa94c256e"));
executeTest("test SLOD", spec);
}
@Test @Test
public void testOutputParameter() { public void testOutputParameter() {
HashMap<String, String> e = new HashMap<String, String>(); HashMap<String, String> e = new HashMap<String, String>();