Deleting the pooled calculation model -- no longer supported.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3088 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-03-29 11:44:27 +00:00
parent 85037ab13f
commit 8ea98faf47
7 changed files with 26 additions and 335 deletions

View File

@ -20,7 +20,6 @@ public abstract class GenotypeCalculationModel implements Cloneable {
public enum Model {
JOINT_ESTIMATE,
JOINT_ESTIMATE_EXPT_GL,
POOLED,
INDELS
}

View File

@ -69,9 +69,9 @@ public class GenotypeCalculationModelFactory {
boolean useExptGenotypeLikelihoods = UAC.genotypeModel == JOINT_ESTIMATE_EXPT_GL;
gcm = new DiploidGenotypeCalculationModel(useExptGenotypeLikelihoods);
break;
case POOLED:
gcm = new PooledCalculationModel();
break;
// case POOLED:
// gcm = new PooledCalculationModel();
// break;
case INDELS:
gcm = new SimpleIndelCalculationModel();
break;

View File

@ -1,296 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;
import net.sf.samtools.SAMRecord;
/**
 * Genotype calculation model for pooled sequencing data: the entire pileup is
 * treated as a single synthetic sample (keyed by {@link #POOL_SAMPLE_NAME}),
 * and per-site data likelihoods are accumulated for every candidate
 * alternate-allele frequency from 0 to the number of chromosomes in the pool.
 *
 * NOTE(review): this class reads several members inherited from
 * JointEstimateGenotypeCalculationModel that are not visible in this file
 * (POOL_SIZE, baseModel, defaultPlatform, log10PofDgivenAFi,
 * LOG10_OPTIMIZATION_EPSILON, getMinAlleleFrequencyToTest,
 * ignoreAlleleFrequenciesAboveI) -- their semantics are assumed from usage.
 * NOTE(review): the static fields (fourBaseLikelihoods, CACHE, N_CACHED) are
 * shared mutable state with no synchronization -- presumably single-threaded
 * use only; confirm before using from multiple threads.
 */
public class PooledCalculationModel extends JointEstimateGenotypeCalculationModel {
// name of the single synthetic sample under which the pooled pileup is stratified
protected static final String POOL_SAMPLE_NAME = "POOL";
// lazily-initialized four-base likelihood calculator, shared by all instances
private static FourBaseProbabilities fourBaseLikelihoods = null;
// when true, per-(base, qual, read platform, strand, frequency) likelihoods are memoized in CACHE
private static boolean USE_CACHE = true;
/**
 * Protected constructor; instances are created by GenotypeCalculationModelFactory.
 */
protected PooledCalculationModel() {}
/**
 * Per-site setup: delegates to the superclass, then lazily constructs the
 * shared four-base likelihood calculator and the memoization cache.
 *
 * @param ref reference base
 * @param contexts alignment contexts
 * @param contextType context type
 */
protected void initialize(char ref, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
super.initialize(ref, contexts, contextType);
// todo -- move this code to a static initializer
// prepare the four base vector calculator
if ( fourBaseLikelihoods == null )
fourBaseLikelihoods = FourBaseProbabilitiesFactory.makeFourBaseLikelihoods(baseModel, defaultPlatform);
// setup the cache
if ( CACHE == null )
makeCache(POOL_SIZE);
}
// for a pool the "sample count" is the configured pool size, not the number of stratified contexts
protected int getNSamples(Map<String, StratifiedAlignmentContext> contexts) {
return POOL_SIZE;
}
/**
 * Fills log10PofDgivenAFi[altIndex][f] for every alternate-allele count f in
 * [0, nChromosomes] by summing per-pileup-element log10 likelihoods.
 * Deletions, qual-0 bases, and non-simple bases are skipped.  Once past the
 * minimum frequency that must be tested, iteration stops early when the
 * current likelihood has fallen more than LOG10_OPTIMIZATION_EPSILON below
 * the best seen, since later frequencies cannot change the confidence score.
 *
 * @param ref          reference base
 * @param alt          alternate base being tested
 * @param nChromosomes total chromosomes in the pool (2 * pool size, presumably -- see makeCache)
 * @param contexts     alignment contexts keyed by sample (only POOL_SAMPLE_NAME is read)
 * @param contextType  which stratified context to take the pileup from
 */
protected void calculatelog10PofDgivenAFforAllF(char ref, char alt, int nChromosomes, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
StratifiedAlignmentContext context = contexts.get(POOL_SAMPLE_NAME);
ReadBackedPileup pileup = context.getContext(contextType).getBasePileup();
int refIndex = BaseUtils.simpleBaseToBaseIndex(ref);
int altIndex = BaseUtils.simpleBaseToBaseIndex(alt);
// running maximum of the per-frequency likelihoods, used by the early-exit test below
double maxLikelihoodSeen = -1.0 * Double.MAX_VALUE;
int minAlleleFrequencyToTest = getMinAlleleFrequencyToTest();
// for each minor allele frequency, calculate log10PofDgivenAFi
for (int frequency = 0; frequency <= nChromosomes; frequency++) {
for ( PileupElement p : pileup ) {
// ignore deletions
if ( p.isDeletion() )
continue;
char base = (char)p.getBase();
byte qual = p.getQual();
// skip qual-0 bases and anything that is not A/C/G/T (index -1)
if ( qual > 0 && BaseUtils.simpleBaseToBaseIndex(base) != -1 )
log10PofDgivenAFi[altIndex][frequency] += calcPBGivenH(refIndex, altIndex, frequency, nChromosomes, base, qual, p.getRead(), p.getOffset());
}
// an optimization to speed up the calculation: if we are beyond the local maximum such
// that subsequent likelihoods won't factor into the confidence score, just quit
if ( frequency > minAlleleFrequencyToTest && maxLikelihoodSeen - log10PofDgivenAFi[altIndex][frequency] > LOG10_OPTIMIZATION_EPSILON ) {
ignoreAlleleFrequenciesAboveI(frequency, nChromosomes, altIndex);
return;
}
if ( log10PofDgivenAFi[altIndex][frequency] > maxLikelihoodSeen )
maxLikelihoodSeen = log10PofDgivenAFi[altIndex][frequency];
}
}
// cache = BMM x PL x REF x ALT x base x QUAL x strand x F as i
// NOTE(review): allocated eagerly for the full index space in makeCache();
// a value of 0.0 doubles as the "not yet computed" sentinel (see calcPBGivenH)
static double[][][][][][][][] CACHE = null;
// running count of cache insertions (diagnostic only; see commented printf in setCache)
static int N_CACHED = 0;
// allocates the 8-dimensional cache; the last axis holds 2*pool_size+1 allele-count entries
private static void makeCache(int pool_size) {
CACHE = new double[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][BaseUtils.BASES.length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][2][2 * pool_size+1];
}
/**
 * Stores a computed likelihood in CACHE, indexed by mismatch model, sequencer
 * platform, ref base, alt base, observed base, quality, strand, and alt-allele count.
 */
protected void setCache( int refIndex, int altIndex, int nAltAlleles, char base, byte qual, SAMRecord read, double val ) {
int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
int i = refIndex;
int j = altIndex;
int k = BaseUtils.simpleBaseToBaseIndex(base);
int l = qual;
int x = GenotypeLikelihoods.strandIndex(! read.getReadNegativeStrandFlag());
int f = nAltAlleles;
N_CACHED++;
//System.out.printf("Setting cache value %d %d %d %d %d %d %d %d = %f [count = %d]%n", m, a, i, j, k, l, x, f, val, N_CACHED);
CACHE[m][a][i][j][k][l][x][f] = val;
}
/**
 * Looks up a previously cached likelihood using the same 8-dimensional index
 * scheme as setCache; returns 0.0 when no value has been stored yet.
 */
protected double getCache( int refIndex, int altIndex, int nAltAlleles, char base, byte qual, SAMRecord read ) {
int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
int i = refIndex;
int j = altIndex;
int k = BaseUtils.simpleBaseToBaseIndex(base);
int l = qual;
int x = GenotypeLikelihoods.strandIndex(! read.getReadNegativeStrandFlag());
int f = nAltAlleles;
//System.out.printf("Getting cache value %d %d %d %d %d %d %d %d%n", m, a, i, j, k, l, x, f);
return CACHE[m][a][i][j][k][l][x][f];
}
/**
 * Cached wrapper around reallyCalcPBGivenH.
 * NOTE(review): 0.0 is treated as "not cached", so a genuine log10 likelihood
 * of exactly 0.0 (probability 1) would be recomputed on every call -- harmless
 * for correctness but defeats the cache for that entry.
 */
private double calcPBGivenH(int refIndex, int altIndex, int nAltAlleles, int nChromosomes, char base, byte qual, SAMRecord read, int offset) {
double L;
if ( USE_CACHE ) {
L = getCache(refIndex, altIndex, nAltAlleles, base, qual, read);
if ( L == 0.0 ) {
L = reallyCalcPBGivenH(refIndex, altIndex, nAltAlleles, nChromosomes, base, qual, read, offset);
setCache(refIndex, altIndex, nAltAlleles, base, qual, read, L);
}
} else {
L = reallyCalcPBGivenH(refIndex, altIndex, nAltAlleles, nChromosomes, base, qual, read, offset);
}
return L;
}
/**
 * The actual likelihood of one observed base under the hypothesis that the
 * pool carries nAltAlleles alternate alleles out of nChromosomes:
 *   P(b | f) = (1 - f) * P(b | ref) + f * P(b | alt),  f = nAltAlleles / nChromosomes
 * returned in log10 space.  The conditional base probabilities come from the
 * four-base likelihood model for this read/offset.
 */
private double reallyCalcPBGivenH(int refIndex, int altIndex, int nAltAlleles, int nChromosomes, char base, byte qual, SAMRecord read, int offset) {
double f = (1.0 * nAltAlleles) / nChromosomes;
double POfRef = 1 - f;
double POfAlt = f;
FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(base, qual, read, offset);
double POfBGivenRef = fbl.getLikelihood(refIndex);
double POfBGivenAlt = fbl.getLikelihood(altIndex);
double P = POfRef * POfBGivenRef + POfAlt * POfBGivenAlt;
return Math.log10(P);
}
// NOTE(review): the three commented-out methods below are dead experimental
// variants of the per-frequency likelihood computation (kept verbatim);
// candidates for outright deletion.
/* protected double computeLog10PofDgivenAFi(char refArg, char altArg, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME);
ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType));
double log10L = 0.0;
int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg);
int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg);
int nChromosomes = 2 * getNSamples(contexts);
int nAltAlleles = (int)(f * nChromosomes);
int nRefAlleles = nChromosomes - nAltAlleles;
double log10POfRef = log10OneMinusF[nAltAlleles];
double log10POfAlt = log10F[nAltAlleles];
double POfRef = Math.pow(10,log10POfRef);
double POfAlt = Math.pow(10,log10POfAlt);
for (int i = 0; i < pileup.getReads().size(); i++) {
int offset = pileup.getOffsets().get(i);
// ignore deletions
if ( offset == -1 )
continue;
SAMRecord read = pileup.getReads().get(i);
char base = (char)read.getReadBases()[offset];
int bIndex = BaseUtils.simpleBaseToBaseIndex(base);
byte qual = read.getBaseQualities()[offset];
if ( qual > 0 && bIndex != -1 ) {
FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(base, qual, read, offset);
double POfBGivenRef = fbl.getLikelihood(refIndex);
double POfBGivenAlt = fbl.getLikelihood(altIndex);
double P = POfRef * POfBGivenRef + POfAlt * POfBGivenAlt;
log10L += Math.log10(P);
}
}
//if ( verboseWriter != null )
// verboseWriter.printf("POOL_DEBUG %s %c %c %d %d %d %.2f %.2f %.2f %f%n",
// context.getContext(StratifiedContext.OVERALL).getLocation(),
// refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L);
return log10L;
}*/
/* protected double computeLog10PofDgivenAFi_V2(char refArg, char altArg, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME);
ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType));
double log10L = 0.0;
int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg);
int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg);
int nChromosomes = 2 * getNSamples(contexts);
int nAltAlleles = (int)(f * nChromosomes);
int nRefAlleles = nChromosomes - nAltAlleles;
double log10POfRef = log10OneMinusF[nAltAlleles];
double log10POfAlt = log10F[nAltAlleles];
for (int i = 0; i < pileup.getReads().size(); i++) {
int offset = pileup.getOffsets().get(i);
// ignore deletions
if ( offset == -1 )
continue;
SAMRecord read = pileup.getReads().get(i);
char base = (char)read.getReadBases()[offset];
int bIndex = BaseUtils.simpleBaseToBaseIndex(base);
byte qual = read.getBaseQualities()[offset];
double log10POfNotB = Math.log10(QualityUtils.qualToErrorProb(qual));
if ( qual > 0 && bIndex != -1 ) {
FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(base, qual, read, offset);
double log10POfB = fbl.getLog10Likelihood(base);
if ( bIndex == refIndex && nRefAlleles > 0 ) {
log10L += log10POfRef + log10POfB;
} else if ( bIndex == altIndex && nAltAlleles > 0) {
log10L += log10POfAlt + log10POfB;
} else {
//log10L += Math.min(log10POfRef + fbl.getLog10Likelihood(refIndex), log10POfAlt + fbl.getLog10Likelihood(altIndex));
log10L += log10POfNotB;
}
}
}
if ( verboseWriter != null )
verboseWriter.printf("POOL_DEBUG %s %c %c %d %d %d %.2f %.2f %.2f %f%n",
context.getContext(StratifiedContext.OVERALL).getLocation(),
refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L);
return log10L;
} */
/* protected double computeLog10PofDgivenAFi_V1(char refArg, char altArg, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME);
ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType));
double log10L = 0.0;
int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg);
int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg);
int nChromosomes = 2 * getNSamples(contexts);
int nAltAlleles = (int)(f * nChromosomes);
int nRefAlleles = nChromosomes - nAltAlleles;
double log10POfRef = Math.log10(1 - f);
double log10POfAlt = Math.log10(f);
//double log10ChromChooseRef = Math.log10(Arithmetic.binomial(nChromosomes, nRefAlleles));
//double log10ChromChooseAlt = Math.log10(Arithmetic.binomial(nChromosomes, nAltAlleles));
byte[] bases = pileup.getBases();
byte[] quals = pileup.getQuals();
for ( int i = 0; i < bases.length; i++ ) {
int bIndex = BaseUtils.simpleBaseToBaseIndex((char)bases[i]);
byte qual = quals[i];
if ( qual > 0 && bIndex != -1 ) {
double log10POfB = Math.log10(QualityUtils.qualToProb(qual));
double log10POfNotB = Math.log10(QualityUtils.qualToErrorProb(qual));
//System.out.printf("%f %f %f %d%n", log10L, log10POfB, log10POfNotB, qual);
if ( bIndex == refIndex && nRefAlleles > 0 ) {
log10L += log10POfRef + log10POfB;
} else if ( bIndex == altIndex && nAltAlleles > 0) {
log10L += log10POfAlt + log10POfB;
} else {
log10L += log10POfNotB;
}
}
}
if ( verboseWriter != null )
verboseWriter.printf("POOL_DEBUG %s %c %c %d %d %d %.2f %.2f %.2f %f%n",
context.getContext(StratifiedContext.OVERALL).getLocation(),
refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L);
return log10L;
}*/
}

View File

@ -31,7 +31,7 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
public class UnifiedArgumentCollection {
// control the various models to be used
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- JOINT_ESTIMATE is currently the default, while POOLED is also available.", required = false)
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- JOINT_ESTIMATE is currently the default.", required = false)
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.JOINT_ESTIMATE;
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
@ -40,7 +40,7 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY;
@Argument(fullName = "poolSize", shortName = "ps", doc = "Number of individuals in the pool (for POOLED model only)", required = false)
@Argument(fullName = "poolSize", shortName = "ps", doc = "Number of individuals in the pool -- no longer in use", required = false)
public int POOLSIZE = 0;
// control the output

View File

@ -154,11 +154,11 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.STRAND_BIAS_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Strand Bias"));
// FORMAT and INFO fields if not in POOLED mode
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
headerInfo.addAll(VCFGenotypeRecord.getSupportedHeaderStrings());
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_COUNT_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_NUMBER_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Total number of alleles in called genotypes"));
}
//if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
headerInfo.addAll(VCFGenotypeRecord.getSupportedHeaderStrings());
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_COUNT_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_NUMBER_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Total number of alleles in called genotypes"));
//}
// all of the arguments from the argument collection
Set<Object> args = new HashSet<Object>();

View File

@ -93,32 +93,20 @@ public class UnifiedGenotyperEngine {
if ( UAC.genotypeModel == GenotypeCalculationModel.Model.INDELS && !(genotypeWriter instanceof VCFGenotypeWriter) ) {
throw new IllegalArgumentException("Attempting to use an output format other than VCF with indels. Please set the output format to VCF.");
}
if ( UAC.POOLSIZE > 0 && UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
throw new IllegalArgumentException("Attempting to use a model other than POOLED with pooled data. Please set the model to POOLED.");
}
if ( UAC.POOLSIZE < 1 && UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ) {
throw new IllegalArgumentException("Attempting to use the POOLED model with a pool size less than 1. Please set the pool size to an appropriate value.");
if ( UAC.POOLSIZE > 0 ) {
throw new IllegalArgumentException("Attempting to provide deprecated POOLSIZE parameter for retired POOLED model");
}
if ( toolkit.getArguments().numberOfThreads > 1 && UAC.ASSUME_SINGLE_SAMPLE != null ) {
// the ASSUME_SINGLE_SAMPLE argument can't be handled (at least for now) while we are multi-threaded because the IO system doesn't know how to get the sample name
throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads");
}
// get all of the unique sample names - unless we're in POOLED mode, in which case we ignore the sample names
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
// if we're supposed to assume a single sample, do so
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
this.samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
this.samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
}
// in pooled mode we need to check that the format is acceptable
if ( UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED && genotypeWriter != null ) {
// when using VCF with multiple threads, we need to turn down the validation stringency so that writing temporary files will work
if ( toolkit.getArguments().numberOfThreads > 1 && genotypeWriter instanceof VCFGenotypeWriter )
((VCFGenotypeWriter)genotypeWriter).setValidationStringency(VCFGenotypeWriterAdapter.VALIDATION_STRINGENCY.SILENT);
}
// get all of the unique sample names
// if we're supposed to assume a single sample, do so
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
this.samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
this.samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
// check to see whether a dbsnp rod was included
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
@ -215,7 +203,7 @@ public class UnifiedGenotyperEngine {
// stratify the AlignmentContext and cut by sample
// Note that for testing purposes, we may want to throw multi-samples at pooled mode
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE, (UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ? PooledCalculationModel.POOL_SAMPLE_NAME : null));
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE, null);
if ( stratifiedContexts == null )
return null;

View File

@ -18,13 +18,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// testing pooled model
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testPooled1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
Arrays.asList("c91f44a198cd7222520118726ea806ca"));
executeTest("testPooled1", spec);
}
// @Test
// public void testPooled1() {
// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
// "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
// Arrays.asList("c91f44a198cd7222520118726ea806ca"));
// executeTest("testPooled1", spec);
// }
// --------------------------------------------------------------------------------------------------------------
//