diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index 342e3a90e..a8cfcd563 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -20,7 +20,6 @@ public abstract class GenotypeCalculationModel implements Cloneable { public enum Model { JOINT_ESTIMATE, JOINT_ESTIMATE_EXPT_GL, - POOLED, INDELS } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java index a2b6a074c..08bfab7d5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java @@ -69,9 +69,9 @@ public class GenotypeCalculationModelFactory { boolean useExptGenotypeLikelihoods = UAC.genotypeModel == JOINT_ESTIMATE_EXPT_GL; gcm = new DiploidGenotypeCalculationModel(useExptGenotypeLikelihoods); break; - case POOLED: - gcm = new PooledCalculationModel(); - break; +// case POOLED: +// gcm = new PooledCalculationModel(); +// break; case INDELS: gcm = new SimpleIndelCalculationModel(); break; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java deleted file mode 100755 index a5f0af1a3..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java +++ /dev/null @@ -1,296 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; - -import java.util.*; - -import net.sf.samtools.SAMRecord; - -public class PooledCalculationModel extends JointEstimateGenotypeCalculationModel { - protected static final String POOL_SAMPLE_NAME = "POOL"; - - private static FourBaseProbabilities fourBaseLikelihoods = null; - private static boolean USE_CACHE = true; - - /** - * - */ - protected PooledCalculationModel() {} - - /** - * - * @param ref reference base - * @param contexts alignment contexts - * @param contextType context type - */ - protected void initialize(char ref, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType) { - super.initialize(ref, contexts, contextType); - - // todo -- move this code to a static initializer - - // prepare the four base vector calculator - if ( fourBaseLikelihoods == null ) - fourBaseLikelihoods = FourBaseProbabilitiesFactory.makeFourBaseLikelihoods(baseModel, defaultPlatform); - - // setup the cache - if ( CACHE == null ) - makeCache(POOL_SIZE); - } - - protected int getNSamples(Map contexts) { - return POOL_SIZE; - } - - protected void calculatelog10PofDgivenAFforAllF(char ref, char alt, int nChromosomes, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType) { - - StratifiedAlignmentContext context = contexts.get(POOL_SAMPLE_NAME); - ReadBackedPileup pileup = context.getContext(contextType).getBasePileup(); - - int refIndex = BaseUtils.simpleBaseToBaseIndex(ref); - int altIndex = BaseUtils.simpleBaseToBaseIndex(alt); - - double maxLikelihoodSeen = -1.0 * Double.MAX_VALUE; - int minAlleleFrequencyToTest = getMinAlleleFrequencyToTest(); - - // for each minor allele frequency, calculate log10PofDgivenAFi - for (int frequency = 0; frequency <= nChromosomes; frequency++) { - - for ( PileupElement p : pileup ) { - // ignore deletions - if ( p.isDeletion() ) - continue; - - char base = (char)p.getBase(); - byte qual = p.getQual(); - - if ( qual > 0 && BaseUtils.simpleBaseToBaseIndex(base) != -1 ) - log10PofDgivenAFi[altIndex][frequency] += calcPBGivenH(refIndex, altIndex, frequency, nChromosomes, base, qual, p.getRead(), p.getOffset()); - } - - // an optimization to speed up the calculation: if we are beyond the local maximum such - // that subsequent likelihoods won't factor into the confidence score, just quit - if ( frequency > minAlleleFrequencyToTest && maxLikelihoodSeen - log10PofDgivenAFi[altIndex][frequency] > LOG10_OPTIMIZATION_EPSILON ) { - ignoreAlleleFrequenciesAboveI(frequency, nChromosomes, altIndex); - return; - } - - if ( log10PofDgivenAFi[altIndex][frequency] > maxLikelihoodSeen ) - maxLikelihoodSeen = log10PofDgivenAFi[altIndex][frequency]; - } - } - - // cache = BMM x PL x REF x ALT x base x QUAL x strand x F as i - static double[][][][][][][][] CACHE = null; - static int N_CACHED = 0; - private static void makeCache(int pool_size) { - CACHE = new double[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][BaseUtils.BASES.length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][2][2 * pool_size+1]; - } - - protected void setCache( int refIndex, int altIndex, int nAltAlleles, char base, byte qual, SAMRecord read, double val ) { - int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal(); - int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read); - int i = refIndex; - int j = altIndex; - int k = BaseUtils.simpleBaseToBaseIndex(base); - int l = qual; - int x = GenotypeLikelihoods.strandIndex(! read.getReadNegativeStrandFlag()); - int f = nAltAlleles; - - N_CACHED++; - //System.out.printf("Setting cache value %d %d %d %d %d %d %d %d = %f [count = %d]%n", m, a, i, j, k, l, x, f, val, N_CACHED); - CACHE[m][a][i][j][k][l][x][f] = val; - } - - protected double getCache( int refIndex, int altIndex, int nAltAlleles, char base, byte qual, SAMRecord read ) { - int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal(); - int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read); - int i = refIndex; - int j = altIndex; - int k = BaseUtils.simpleBaseToBaseIndex(base); - int l = qual; - int x = GenotypeLikelihoods.strandIndex(! read.getReadNegativeStrandFlag()); - int f = nAltAlleles; - //System.out.printf("Getting cache value %d %d %d %d %d %d %d %d%n", m, a, i, j, k, l, x, f); - - return CACHE[m][a][i][j][k][l][x][f]; - } - - private double calcPBGivenH(int refIndex, int altIndex, int nAltAlleles, int nChromosomes, char base, byte qual, SAMRecord read, int offset) { - double L; - - if ( USE_CACHE ) { - L = getCache(refIndex, altIndex, nAltAlleles, base, qual, read); - if ( L == 0.0 ) { - L = reallyCalcPBGivenH(refIndex, altIndex, nAltAlleles, nChromosomes, base, qual, read, offset); - setCache(refIndex, altIndex, nAltAlleles, base, qual, read, L); - } - } else { - L = reallyCalcPBGivenH(refIndex, altIndex, nAltAlleles, nChromosomes, base, qual, read, offset); - } - - return L; - } - - private double reallyCalcPBGivenH(int refIndex, int altIndex, int nAltAlleles, int nChromosomes, char base, byte qual, SAMRecord read, int offset) { - double f = (1.0 * nAltAlleles) / nChromosomes; - double POfRef = 1 - f; - double POfAlt = f; - - FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(base, qual, read, offset); - double POfBGivenRef = fbl.getLikelihood(refIndex); - double POfBGivenAlt = fbl.getLikelihood(altIndex); - double P = POfRef * POfBGivenRef + POfAlt * POfBGivenAlt; - return Math.log10(P); - } - -/* protected double computeLog10PofDgivenAFi(char refArg, char altArg, double f, HashMap contexts, StratifiedContext contextType) { - AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME); - ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType)); - - double log10L = 0.0; - - int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg); - int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg); - - int nChromosomes = 2 * getNSamples(contexts); - int nAltAlleles = (int)(f * nChromosomes); - int nRefAlleles = nChromosomes - nAltAlleles; - - double log10POfRef = log10OneMinusF[nAltAlleles]; - double log10POfAlt = log10F[nAltAlleles]; - double POfRef = Math.pow(10,log10POfRef); - double POfAlt = Math.pow(10,log10POfAlt); - - for (int i = 0; i < pileup.getReads().size(); i++) { - int offset = pileup.getOffsets().get(i); - - // ignore deletions - if ( offset == -1 ) - continue; - - SAMRecord read = pileup.getReads().get(i); - char base = (char)read.getReadBases()[offset]; - int bIndex = BaseUtils.simpleBaseToBaseIndex(base); - byte qual = read.getBaseQualities()[offset]; - - if ( qual > 0 && bIndex != -1 ) { - FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(base, qual, read, offset); - double POfBGivenRef = fbl.getLikelihood(refIndex); - double POfBGivenAlt = fbl.getLikelihood(altIndex); - double P = POfRef * POfBGivenRef + POfAlt * POfBGivenAlt; - log10L += Math.log10(P); - } - } - - //if ( verboseWriter != null ) - // verboseWriter.printf("POOL_DEBUG %s %c %c %d %d %d %.2f %.2f %.2f %f%n", - // context.getContext(StratifiedContext.OVERALL).getLocation(), - // refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L); - - return log10L; - }*/ - -/* protected double computeLog10PofDgivenAFi_V2(char refArg, char altArg, double f, HashMap contexts, StratifiedContext contextType) { - AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME); - ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType)); - - double log10L = 0.0; - - int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg); - int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg); - - int nChromosomes = 2 * getNSamples(contexts); - int nAltAlleles = (int)(f * nChromosomes); - int nRefAlleles = nChromosomes - nAltAlleles; - - double log10POfRef = log10OneMinusF[nAltAlleles]; - double log10POfAlt = log10F[nAltAlleles]; - - for (int i = 0; i < pileup.getReads().size(); i++) { - int offset = pileup.getOffsets().get(i); - - // ignore deletions - if ( offset == -1 ) - continue; - - SAMRecord read = pileup.getReads().get(i); - char base = (char)read.getReadBases()[offset]; - int bIndex = BaseUtils.simpleBaseToBaseIndex(base); - byte qual = read.getBaseQualities()[offset]; - - double log10POfNotB = Math.log10(QualityUtils.qualToErrorProb(qual)); - if ( qual > 0 && bIndex != -1 ) { - FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(base, qual, read, offset); - - double log10POfB = fbl.getLog10Likelihood(base); - - if ( bIndex == refIndex && nRefAlleles > 0 ) { - log10L += log10POfRef + log10POfB; - } else if ( bIndex == altIndex && nAltAlleles > 0) { - log10L += log10POfAlt + log10POfB; - } else { - //log10L += Math.min(log10POfRef + fbl.getLog10Likelihood(refIndex), log10POfAlt + fbl.getLog10Likelihood(altIndex)); - log10L += log10POfNotB; - } - } - } - - if ( verboseWriter != null ) - verboseWriter.printf("POOL_DEBUG %s %c %c %d %d %d %.2f %.2f %.2f %f%n", - context.getContext(StratifiedContext.OVERALL).getLocation(), - refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L); - - return log10L; - } */ - -/* protected double computeLog10PofDgivenAFi_V1(char refArg, char altArg, double f, HashMap contexts, StratifiedContext contextType) { - AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME); - ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType)); - - double log10L = 0.0; - - int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg); - int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg); - - int nChromosomes = 2 * getNSamples(contexts); - int nAltAlleles = (int)(f * nChromosomes); - int nRefAlleles = nChromosomes - nAltAlleles; - - double log10POfRef = Math.log10(1 - f); - double log10POfAlt = Math.log10(f); - //double log10ChromChooseRef = Math.log10(Arithmetic.binomial(nChromosomes, nRefAlleles)); - //double log10ChromChooseAlt = Math.log10(Arithmetic.binomial(nChromosomes, nAltAlleles)); - - byte[] bases = pileup.getBases(); - byte[] quals = pileup.getQuals(); - - for ( int i = 0; i < bases.length; i++ ) { - int bIndex = BaseUtils.simpleBaseToBaseIndex((char)bases[i]); - byte qual = quals[i]; - if ( qual > 0 && bIndex != -1 ) { - double log10POfB = Math.log10(QualityUtils.qualToProb(qual)); - double log10POfNotB = Math.log10(QualityUtils.qualToErrorProb(qual)); - //System.out.printf("%f %f %f %d%n", log10L, log10POfB, log10POfNotB, qual); - - if ( bIndex == refIndex && nRefAlleles > 0 ) { - log10L += log10POfRef + log10POfB; - } else if ( bIndex == altIndex && nAltAlleles > 0) { - log10L += log10POfAlt + log10POfB; - } else { - log10L += log10POfNotB; - } - } - } - - if ( verboseWriter != null ) - verboseWriter.printf("POOL_DEBUG %s %c %c %d %d %d %.2f %.2f %.2f %f%n", - context.getContext(StratifiedContext.OVERALL).getLocation(), - refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L); - - return log10L; - }*/ -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 9e51d9afa..a322b06b3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.utils.cmdLine.Argument; public class UnifiedArgumentCollection { // control the various models to be used - @Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- JOINT_ESTIMATE is currently the default, while POOLED is also available.", required = false) + @Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- JOINT_ESTIMATE is currently the default.", required = false) public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.JOINT_ESTIMATE; @Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false) @@ -40,7 +40,7 @@ public class UnifiedArgumentCollection { @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY; - @Argument(fullName = "poolSize", shortName = "ps", doc = "Number of individuals in the pool (for POOLED model only)", required = false) + @Argument(fullName = "poolSize", shortName = "ps", doc = "Number of individuals in the pool -- no longer in use", required = false) public int POOLSIZE = 0; // control the output diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index feb2eeac4..85cf55286 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -154,11 +154,11 @@ public class UnifiedGenotyper extends LocusWalker args = new HashSet(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 9bd07494d..f293df99e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -93,32 +93,20 @@ public class UnifiedGenotyperEngine { if ( UAC.genotypeModel == GenotypeCalculationModel.Model.INDELS && !(genotypeWriter instanceof VCFGenotypeWriter) ) { throw new IllegalArgumentException("Attempting to use an output format other than VCF with indels. Please set the output format to VCF."); } - if ( UAC.POOLSIZE > 0 && UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) { - throw new IllegalArgumentException("Attempting to use a model other than POOLED with pooled data. Please set the model to POOLED."); - } - if ( UAC.POOLSIZE < 1 && UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ) { - throw new IllegalArgumentException("Attempting to use the POOLED model with a pool size less than 1. Please set the pool size to an appropriate value."); + if ( UAC.POOLSIZE > 0 ) { + throw new IllegalArgumentException("Attempting to provide depreciated POOLSIZE parameter for retired POOLED model"); } if ( toolkit.getArguments().numberOfThreads > 1 && UAC.ASSUME_SINGLE_SAMPLE != null ) { // the ASSUME_SINGLE_SAMPLE argument can't be handled (at least for now) while we are multi-threaded because the IO system doesn't know how to get the sample name throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads"); } - // get all of the unique sample names - unless we're in POOLED mode, in which case we ignore the sample names - if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) { - // if we're supposed to assume a single sample, do so - if ( UAC.ASSUME_SINGLE_SAMPLE != null ) - this.samples.add(UAC.ASSUME_SINGLE_SAMPLE); - else - this.samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); - } - - // in pooled mode we need to check that the format is acceptable - if ( UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED && genotypeWriter != null ) { - // when using VCF with multiple threads, we need to turn down the validation stringency so that writing temporary files will work - if ( toolkit.getArguments().numberOfThreads > 1 && genotypeWriter instanceof VCFGenotypeWriter ) - ((VCFGenotypeWriter)genotypeWriter).setValidationStringency(VCFGenotypeWriterAdapter.VALIDATION_STRINGENCY.SILENT); - } + // get all of the unique sample names + // if we're supposed to assume a single sample, do so + if ( UAC.ASSUME_SINGLE_SAMPLE != null ) + this.samples.add(UAC.ASSUME_SINGLE_SAMPLE); + else + this.samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); // check to see whether a dbsnp rod was included List dataSources = toolkit.getRodDataSources(); @@ -215,7 +203,7 @@ public class UnifiedGenotyperEngine { // stratify the AlignmentContext and cut by sample // Note that for testing purposes, we may want to throw multi-samples at pooled mode - Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE, (UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ? PooledCalculationModel.POOL_SAMPLE_NAME : null)); + Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE, null); if ( stratifiedContexts == null ) return null; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9dd6278f7..d81b0034c 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -18,13 +18,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // testing pooled model // // -------------------------------------------------------------------------------------------------------------- - @Test - public void testPooled1() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1, - Arrays.asList("c91f44a198cd7222520118726ea806ca")); - executeTest("testPooled1", spec); - } +// @Test +// public void testPooled1() { +// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( +// "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1, +// Arrays.asList("c91f44a198cd7222520118726ea806ca")); +// executeTest("testPooled1", spec); +// } // -------------------------------------------------------------------------------------------------------------- //