From b8ab77c91c3689a06f9bc5d77a53445aa6428cf3 Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 20 Oct 2009 01:30:53 +0000 Subject: [PATCH] Don't filter out reads without proper read groups. Instead, allow the user (or another walker calling UG) to specify an assumed sample to use (but then we assume single-sample mode). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1883 348d0f76-0448-11de-a6fe-93d51630548a --- .../AllMAFsGenotypeCalculationModel.java | 131 +++++++++++++++--- .../genotyper/EMGenotypeCalculationModel.java | 10 +- .../genotyper/GenotypeCalculationModel.java | 7 + .../genotyper/UnifiedArgumentCollection.java | 5 + .../walkers/genotyper/UnifiedGenotyper.java | 23 ++- 5 files changed, 151 insertions(+), 25 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java index 784b49dea..b7404b8c5 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java @@ -1,8 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.*; import java.util.*; @@ -10,33 +10,69 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel protected AllMAFsGenotypeCalculationModel() {} - private double[] alleleFrequencies; + // because the null allele frequencies are constant for a given N, + // we cache the results to avoid having to recompute everything + private HashMap nullAlleleFrequencyCache = new HashMap(); + // the allele frequencies + private double[][] alleleFrequencies = new double[3][]; + private double[][] oldAlleleFrequencies; + + // keep track of whether or not a given MAF is stable + private boolean[] frequencyStabilityArray = new boolean[3]; + + // the minimum and actual number of points in our allele frequency estimation + private static final int MIN_ESTIMATION_POINTS = 1000; + private int estimationPoints; + + // the GenotypeLikelihoods map + private HashMap GLs = new HashMap(); protected void initializeAlleleFrequencies(int numSamples, char ref) { - // we have 2N possible allele frequencies in pileup - int possibleMAFs = 2 * numSamples; - alleleFrequencies = new double[possibleMAFs]; + // first, initialize the stability array to "unstable" + for (int i = 0; i < 3; i++) + frequencyStabilityArray[i] = false; - // calculate sum(1/i) for i from 1 to 2N - double denominator = 0.0; - for (int i = 1; i <= possibleMAFs; i++) - denominator += 1.0 / (double)i; + // calculate the number of estimation points to use: + // it's either MIN_ESTIMATION_POINTS or 2N if that's larger + // (add 1 for allele frequency of zero) + estimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1; - // set up delta - double delta = 1.0 / denominator; + for (int alt = 0; alt < 3; alt++) + alleleFrequencies[alt] = getNullAlleleFrequencies(estimationPoints); - // calculate the null allele frequencies - for (int i = 1; i <= possibleMAFs; i++) - alleleFrequencies[i-1] = Math.log10(delta / (double)i); + for (int i = 1; i < estimationPoints; i++) + logger.debug("Initial allele frequency for MAF=" + i + ": " + alleleFrequencies[0][i]); + } - for (int i = 0; i < possibleMAFs; i++) - logger.debug("Initial allele frequency for MAF=" + (i+1) + ": " + alleleFrequencies[i]); + private double[] getNullAlleleFrequencies(int N) { + double[] AFs = nullAlleleFrequencyCache.get(N); + + // if it hasn't been calculated yet, do so now + if ( AFs == null ) { + + // calculate sum(1/i) + double denominator = 0.0; + for (int i = 1; i < N; i++) + denominator += 1.0 / (double)i; + + // set up delta + double delta = 1.0 / denominator; + + // calculate the null allele frequencies + AFs = new double[N]; + for (int i = 1; i < N; i++) + AFs[i] = Math.log10(delta / (double)i); + + nullAlleleFrequencyCache.put(N, AFs); + } + + return AFs.clone(); } protected void initializeGenotypeLikelihoods(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { - HashMap GLs = new HashMap(); + GLs.clear(); for ( String sample : contexts.keySet() ) { AlignmentContextBySample context = contexts.get(sample); @@ -47,7 +83,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel GL.setVerbose(VERBOSE); GL.add(pileup, true); - GLs.put(sample, GL); + GLs.put(sample, new AlleleSpecificGenotypeLikelihoods(ref, GL)); } } @@ -60,10 +96,67 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel } protected boolean isStable() { + // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC + // We compute this separately for all of the alternate alleles + for (int i = 0; i < 3; i++) { + // if we've already determined that a MAF is stable, don't recalculate + if ( frequencyStabilityArray[i] ) + continue; + + // determine change + double AF_delta = 0.0; + for (int j = 1; j < estimationPoints; j++) + AF_delta += Math.abs(oldAlleleFrequencies[i][j] - alleleFrequencies[i][j]); + + // if it's not stable, we're done + if (AF_delta > EM_STABILITY_METRIC) + return false; + + // it's stable, so record that fact in the stability array + frequencyStabilityArray[i] = true; + } + + // if we got here, then we're stable return true; } protected EMOutput computePofF(char ref) { return null; } + + + /** + * A class for the allele-specific genotype likelihoods + */ + protected class AlleleSpecificGenotypeLikelihoods { + private HashMap> GLs = new HashMap>(); + + AlleleSpecificGenotypeLikelihoods(char ref, GenotypeLikelihoods GL) { + double[] likelihoods = GL.getLikelihoods(); + double[] posteriors = GL.getPosteriors(); + + + + // get the ref likelihood/posterior + DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref); + double refLikelihood = GL.getLikelihood(refGenotype); + double refPosterior = GL.getPosterior(refGenotype); + String refStr = String.valueOf(ref); + + for ( char base : BaseUtils.BASES ) { + if ( base == ref ) + continue; + + // get hom var and het likelihoods + double homLikelihood = GL.getLikelihood(DiploidGenotype.createHomGenotype(base)); + double hetLikelihood = GL.getLikelihood(DiploidGenotype.valueOf(refStr + String.valueOf(base))); + + + } + } + + + + + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index b65636b02..c319a61ba 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.*; @@ -130,7 +131,14 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode if ( POOLED_INPUT ) { sample = "POOL"; } else { - sample = read.getReadGroup().getSample(); + SAMReadGroupRecord readGroup = read.getReadGroup(); + if ( readGroup == null ) { + if ( assumedSingleSample == null ) + throw new StingException("Missing read group for read " + read.getReadName()); + sample = assumedSingleSample; + } else { + sample = readGroup.getSample(); + } } // create a new context object if this is the first time we're seeing a read for this sample diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index e809d940c..94ce2a55f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -29,6 +29,7 @@ public abstract class GenotypeCalculationModel implements Cloneable { protected int POOL_SIZE; protected double LOD_THRESHOLD; protected int maxDeletionsInPileup; + protected String assumedSingleSample; protected boolean VERBOSE; /** @@ -57,6 +58,7 @@ public abstract class GenotypeCalculationModel implements Cloneable { POOL_SIZE = UAC.POOLSIZE; LOD_THRESHOLD = UAC.LOD_THRESHOLD; maxDeletionsInPileup = UAC.MAX_DELETIONS; + assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE; VERBOSE = UAC.VERBOSE; } @@ -76,10 +78,15 @@ public abstract class GenotypeCalculationModel implements Cloneable { gcm.POOLED_INPUT = POOLED_INPUT; gcm.LOD_THRESHOLD = LOD_THRESHOLD; gcm.maxDeletionsInPileup = maxDeletionsInPileup; + gcm.assumedSingleSample = assumedSingleSample; gcm.VERBOSE = VERBOSE; return gcm; } + public void setAssumedSingleSample(String sample) { + assumedSingleSample = sample; + } + /** * Must be overridden by concrete subclasses * @param tracker rod data diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 99be73636..16715962b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -55,6 +55,11 @@ public class UnifiedArgumentCollection { public boolean VERBOSE = false; + // control the error modes + @Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false) + public String ASSUME_SINGLE_SAMPLE = null; + + // control the various parameters to be used @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false) public double LOD_THRESHOLD = Double.MIN_VALUE; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 610ff6359..457849b58 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -30,7 +30,6 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.MissingReadGroupFilter; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; @@ -52,7 +51,7 @@ import java.util.List; import java.util.ArrayList; -@ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class}) +@ReadFilters({ZeroMappingQualityReadFilter.class}) public class UnifiedGenotyper extends LocusWalker, GenotypeMetaData>, Integer> { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); @@ -83,6 +82,15 @@ public class UnifiedGenotyper extends LocusWalker, Genot /** Enable deletions in the pileup **/ public boolean includeReadsWithDeletionAtLoci() { return true; } + /** + * Sets the single sample to assume when read groups are missing. + * To be used with walkers that call the UnifiedGenotyper's map function + * + **/ + public void setAssumedSingleSample(String sample) { + gcm.setAssumedSingleSample(sample); + } + /** * Initialize the samples, output, and genotype calculation model * @@ -95,9 +103,14 @@ public class UnifiedGenotyper extends LocusWalker, Genot // get all of the unique sample names samples = new HashSet(); - List readGroups = getToolkit().getSAMFileHeader().getReadGroups(); - for ( SAMReadGroupRecord readGroup : readGroups ) - samples.add(readGroup.getSample()); + // if we're supposed to assume a single sample + if ( UAC.ASSUME_SINGLE_SAMPLE != null ) { + samples.add(UAC.ASSUME_SINGLE_SAMPLE); + } else { + List readGroups = getToolkit().getSAMFileHeader().getReadGroups(); + for ( SAMReadGroupRecord readGroup : readGroups ) + samples.add(readGroup.getSample()); + } // print them out for debugging (need separate loop to ensure uniqueness) for ( String sample : samples )