Changes to the default behavior of the UnifiedGenotyper (UG): multi-allelic mode is always on; the maximum number of alternate alleles to genotype defaults to 3; alleles in the SNP model are ranked by their likelihood sum (Guillermo will do the same ranking for indels); the strand bias (SB) annotation is computed by default again.

This commit is contained in:
Eric Banks 2012-02-08 15:27:16 -05:00
parent 717cd4b912
commit 2f800b078c
6 changed files with 72 additions and 69 deletions

View File

@ -55,9 +55,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private final boolean getAlleleListFromVCF;
private boolean DEBUG = false;
private final boolean doMultiAllelicCalls;
private final boolean doMultiAllelicCalls = true;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private final int maxAlternateAlleles;
private PairHMMIndelErrorModel pairModel;
private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap =
@ -88,8 +87,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES;
doMultiAllelicCalls = UAC.MULTI_ALLELIC;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;

View File

@ -43,13 +43,24 @@ import java.util.*;
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private boolean ALLOW_MULTIPLE_ALLELES;
private final boolean useAlleleFromVCF;
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[4];
private final class LikelihoodSum implements Comparable<LikelihoodSum> {
public double sum = 0.0;
public Allele base;
public LikelihoodSum(Allele base) { this.base = base; }
public int compareTo(LikelihoodSum other) {
final double diff = sum - other.sum;
return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0;
}
}
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC;
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
// make sure the PL cache has been initialized with enough alleles
@ -69,7 +80,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
if ( !(priors instanceof DiploidSNPGenotypePriors) )
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
final boolean[] basesToUse = new boolean[4];
final byte refBase = ref.getBase();
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
@ -95,46 +105,40 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
// find the alternate allele(s) that we should be using
if ( alternateAlleleToUse != null ) {
basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true;
alleles.add(alternateAlleleToUse);
} else if ( useAlleleFromVCF ) {
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
// ignore places where we don't have a SNP
if ( vc == null || !vc.isSNP() )
return null;
for ( Allele allele : vc.getAlternateAlleles() )
basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true;
alleles.addAll(vc.getAlternateAlleles());
} else {
determineAlternateAlleles(basesToUse, refBase, GLs);
// how many alternate alleles are we using?
int alleleCounter = Utils.countSetBits(basesToUse);
alleles.addAll(determineAlternateAlleles(refBase, GLs));
// if there are no non-ref alleles...
if ( alleleCounter == 0 ) {
if ( alleles.size() == 1 ) {
// if we only want variants, then we don't need to calculate genotype likelihoods
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
return builder.make();
// otherwise, choose any alternate allele (it doesn't really matter)
basesToUse[indexOfRefBase == 0 ? 1 : 0] = true;
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
}
}
// create the alternate alleles and the allele ordering (the ordering is crucial for the GLs)
final int numAltAlleles = Utils.countSetBits(basesToUse);
final int[] alleleOrdering = new int[numAltAlleles + 1];
alleleOrdering[0] = indexOfRefBase;
int alleleOrderingIndex = 1;
int numLikelihoods = 1;
for ( int i = 0; i < 4; i++ ) {
if ( i != indexOfRefBase && basesToUse[i] ) {
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false));
alleleOrdering[alleleOrderingIndex++] = i;
numLikelihoods += alleleOrderingIndex;
}
final int numAlleles = alleles.size();
final int numAltAlleles = numAlleles - 1;
final int[] alleleOrdering = new int[numAlleles];
int alleleOrderingIndex = 0;
int numLikelihoods = 0;
for ( Allele allele : alleles ) {
alleleOrdering[alleleOrderingIndex++] = BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0]);
numLikelihoods += alleleOrderingIndex;
}
builder.alleles(alleles);
@ -165,13 +169,14 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
return builder.genotypes(genotypes).make();
}
// fills in the allelesToUse array
protected void determineAlternateAlleles(final boolean[] allelesToUse, final byte ref, final List<SampleGenotypeData> sampleDataList) {
// determines the alleles to use
protected List<Allele> determineAlternateAlleles(final byte ref, final List<SampleGenotypeData> sampleDataList) {
final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref);
final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal();
final double[] likelihoodCounts = new double[4];
for ( int i = 0; i < 4; i++ )
likelihoodSums[i] = new LikelihoodSum(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false));
// based on the GLs, find the alternate alleles with the most probability
for ( SampleGenotypeData sampleData : sampleDataList ) {
@ -180,25 +185,21 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
if ( PLindexOfBestGL != PLindexOfRef ) {
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL];
if ( alleles[0] != baseIndexOfRef )
likelihoodCounts[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
likelihoodSums[alleles[0]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
// don't double-count it
if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] )
likelihoodCounts[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
likelihoodSums[alleles[1]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
}
}
if ( ALLOW_MULTIPLE_ALLELES ) {
for ( int i = 0; i < 4; i++ ) {
if ( likelihoodCounts[i] > 0.0 ) {
allelesToUse[i] = true;
}
}
} else {
// set the non-ref base which has the maximum sum of non-ref GLs
final int indexOfMax = MathUtils.maxElementIndex(likelihoodCounts);
if ( likelihoodCounts[indexOfMax] > 0.0 )
allelesToUse[indexOfMax] = true;
Collections.sort(Arrays.asList(likelihoodSums));
final List<Allele> allelesToUse = new ArrayList<Allele>(3);
for ( LikelihoodSum sum : likelihoodSums ) {
if ( sum.sum > 0.0 )
allelesToUse.add(sum.base);
}
return allelesToUse;
}
public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {

View File

@ -84,8 +84,8 @@ public class UnifiedArgumentCollection {
/**
* This argument is not enabled by default because it increases the runtime by an appreciable amount.
*/
@Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false)
public boolean COMPUTE_SLOD = false;
@Argument(fullName = "noSLOD", shortName = "nosl", doc = "If provided, we will not calculate the SLOD", required = false)
public boolean NO_SLOD = false;
/**
* When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
@ -103,21 +103,12 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
/**
* The default behavior of the Unified Genotyper is to allow the genotyping of just one alternate allele in discovery mode; using this flag
* will enable the discovery of multiple alternate alleles. Please note that this works for SNPs only and that it is still highly experimental.
* For advanced users only.
*/
@Advanced
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles", required = false)
public boolean MULTI_ALLELIC = false;
/**
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
* then this site will be skipped and a warning printed. Note that genotyping sites with many alternate alleles is both CPU and memory intensive.
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive.
*/
@Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 5;
public int MAX_ALTERNATE_ALLELES = 3;
// indel-related arguments
/**
@ -168,7 +159,7 @@ public class UnifiedArgumentCollection {
uac.PCR_error = PCR_error;
uac.GenotypingMode = GenotypingMode;
uac.OutputMode = OutputMode;
uac.COMPUTE_SLOD = COMPUTE_SLOD;
uac.NO_SLOD = NO_SLOD;
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
@ -185,7 +176,6 @@ public class UnifiedArgumentCollection {
// todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION;
uac.MULTI_ALLELIC = MULTI_ALLELIC;
return uac;
}

View File

@ -240,7 +240,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
// annotation (INFO) fields from UnifiedGenotyper
if ( UAC.COMPUTE_SLOD )
if ( !UAC.NO_SLOD )
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));

View File

@ -407,7 +407,7 @@ public class UnifiedGenotyperEngine {
if ( !limitedContext && rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) {
if ( !UAC.NO_SLOD && !limitedContext && !bestGuessIsRef ) {
//final boolean DEBUG_SLOD = false;
// the overall lod

View File

@ -15,9 +15,9 @@ import java.util.Map;
public class UnifiedGenotyperIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129;
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129;
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
// --------------------------------------------------------------------------------------------------------------
//
@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("653172b43b19003d9f7df6dab21f4b09"));
Arrays.asList("9ab4e98ce437a1c5e1eee338de49ee7e"));
executeTest("test MultiSample Pilot1", spec);
}
@ -56,6 +56,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
executeTest("test SingleSample Pilot2", spec);
}
@Test
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
Arrays.asList("aabc4b3a312aba18b78e14750d8c8e62"));
executeTest("test Multiple SNP alleles", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing compressed output
@ -114,8 +122,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testCallingParameters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" );
e.put( "--computeSLOD", "6172d2f3d370132f4c57a26aa94c256e" );
e.put( "--min_base_quality_score 26", "258c1b33349eb3b2d395ec4d69302725" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -125,6 +132,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
}
}
@Test
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("6172d2f3d370132f4c57a26aa94c256e"));
executeTest("test SLOD", spec);
}
@Test
public void testOutputParameter() {
HashMap<String, String> e = new HashMap<String, String>();