Don't filter out reads without proper read groups. Instead, allow the user (or another walker calling UG) to specify an assumed sample to use (but then we assume single-sample mode).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1883 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-20 01:30:53 +00:00
parent a8a2c1a2a1
commit b8ab77c91c
5 changed files with 151 additions and 25 deletions

View File

@ -1,8 +1,8 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.*;
@ -10,33 +10,69 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
protected AllMAFsGenotypeCalculationModel() {}
private double[] alleleFrequencies;
// because the null allele frequencies are constant for a given N,
// we cache the results to avoid having to recompute everything
private HashMap<Integer, double[]> nullAlleleFrequencyCache = new HashMap<Integer, double[]>();
// the allele frequencies
private double[][] alleleFrequencies = new double[3][];
private double[][] oldAlleleFrequencies;
// keep track of whether or not a given MAF is stable
private boolean[] frequencyStabilityArray = new boolean[3];
// the minimum and actual number of points in our allele frequency estimation
private static final int MIN_ESTIMATION_POINTS = 1000;
private int estimationPoints;
// the GenotypeLikelihoods map
private HashMap<String, AlleleSpecificGenotypeLikelihoods> GLs = new HashMap<String, AlleleSpecificGenotypeLikelihoods>();
protected void initializeAlleleFrequencies(int numSamples, char ref) {
// we have 2N possible allele frequencies in pileup
int possibleMAFs = 2 * numSamples;
alleleFrequencies = new double[possibleMAFs];
// first, initialize the stability array to "unstable"
for (int i = 0; i < 3; i++)
frequencyStabilityArray[i] = false;
// calculate sum(1/i) for i from 1 to 2N
double denominator = 0.0;
for (int i = 1; i <= possibleMAFs; i++)
denominator += 1.0 / (double)i;
// calculate the number of estimation points to use:
// it's either MIN_ESTIMATION_POINTS or 2N if that's larger
// (add 1 for allele frequency of zero)
estimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1;
// set up delta
double delta = 1.0 / denominator;
for (int alt = 0; alt < 3; alt++)
alleleFrequencies[alt] = getNullAlleleFrequencies(estimationPoints);
// calculate the null allele frequencies
for (int i = 1; i <= possibleMAFs; i++)
alleleFrequencies[i-1] = Math.log10(delta / (double)i);
for (int i = 1; i < estimationPoints; i++)
logger.debug("Initial allele frequency for MAF=" + i + ": " + alleleFrequencies[0][i]);
}
for (int i = 0; i < possibleMAFs; i++)
logger.debug("Initial allele frequency for MAF=" + (i+1) + ": " + alleleFrequencies[i]);
private double[] getNullAlleleFrequencies(int N) {
    // Check the cache first so the table is computed only once per value of N
    double[] cached = nullAlleleFrequencyCache.get(N);
    if ( cached == null ) {
        // harmonic sum: sum of 1/i for i in [1, N)
        double harmonicSum = 0.0;
        for (int i = 1; i < N; i++)
            harmonicSum += 1.0 / (double)i;
        // normalizing constant so the non-zero-frequency entries sum to one
        double normalization = 1.0 / harmonicSum;
        // fill the null allele frequencies in log10 space; index 0 (allele
        // frequency of zero) is left at 0.0 — NOTE(review): confirm intended
        cached = new double[N];
        for (int i = 1; i < N; i++)
            cached[i] = Math.log10(normalization / (double)i);
        nullAlleleFrequencyCache.put(N, cached);
    }
    // hand back a copy so callers cannot corrupt the cached table
    return cached.clone();
}
protected void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
GLs.clear();
for ( String sample : contexts.keySet() ) {
AlignmentContextBySample context = contexts.get(sample);
@ -47,7 +83,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
GL.setVerbose(VERBOSE);
GL.add(pileup, true);
GLs.put(sample, GL);
GLs.put(sample, new AlleleSpecificGenotypeLikelihoods(ref, GL));
}
}
@ -60,10 +96,67 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
}
protected boolean isStable() {
    // EM is considered converged once every alternate allele's MAF estimate
    // moves by no more than EM_STABILITY_METRIC between iterations.
    for (int alt = 0; alt < 3; alt++) {
        // alleles already flagged as converged need no recomputation
        if ( !frequencyStabilityArray[alt] ) {
            // total absolute change across all estimation points for this allele
            double totalChange = 0.0;
            for (int point = 1; point < estimationPoints; point++)
                totalChange += Math.abs(oldAlleleFrequencies[alt][point] - alleleFrequencies[alt][point]);
            // one unstable allele is enough to keep iterating
            if (totalChange > EM_STABILITY_METRIC)
                return false;
            // converged — remember it so later iterations skip this allele
            frequencyStabilityArray[alt] = true;
        }
    }
    // every alternate allele has converged
    return true;
}
/**
 * Computes the final EM output (P of F) for the given reference base.
 * NOTE(review): currently a stub that always returns null — callers must be
 * prepared for a null result; confirm whether this is intentional.
 */
protected EMOutput computePofF(char ref) {
    return null;
}
/**
 * A class for the allele-specific genotype likelihoods.
 * Decomposes a per-sample GenotypeLikelihoods object into values specific to
 * the reference base and each possible alternate base.
 */
protected class AlleleSpecificGenotypeLikelihoods {
    // per-sample storage: pairs of (likelihoods, posteriors) arrays
    private HashMap<String, Pair<double[], double[]>> GLs = new HashMap<String, Pair<double[], double[]>>();

    /**
     * Builds the allele-specific view for one sample.
     *
     * @param ref the reference base at this locus
     * @param GL  the full diploid genotype likelihoods for the sample
     */
    AlleleSpecificGenotypeLikelihoods(char ref, GenotypeLikelihoods GL) {
        // raw arrays from the underlying model
        // NOTE(review): these two locals are never read below — confirm whether
        // this constructor is complete or still work-in-progress
        double[] likelihoods = GL.getLikelihoods();
        double[] posteriors = GL.getPosteriors();

        // likelihood/posterior of the hom-ref genotype, used as the baseline
        DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
        double refLikelihood = GL.getLikelihood(refGenotype);
        double refPosterior = GL.getPosterior(refGenotype);
        String refStr = String.valueOf(ref);

        // walk the possible alternate bases, skipping the reference itself
        for ( char base : BaseUtils.BASES ) {
            if ( base == ref )
                continue;

            // get hom var and het likelihoods
            // NOTE(review): the computed values are currently discarded (never
            // stored into GLs) — confirm the intended use before relying on this
            double homLikelihood = GL.getLikelihood(DiploidGenotype.createHomGenotype(base));
            double hetLikelihood = GL.getLikelihood(DiploidGenotype.valueOf(refStr + String.valueOf(base)));
        }
    }
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.*;
@ -130,7 +131,14 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
if ( POOLED_INPUT ) {
sample = "POOL";
} else {
sample = read.getReadGroup().getSample();
SAMReadGroupRecord readGroup = read.getReadGroup();
if ( readGroup == null ) {
if ( assumedSingleSample == null )
throw new StingException("Missing read group for read " + read.getReadName());
sample = assumedSingleSample;
} else {
sample = readGroup.getSample();
}
}
// create a new context object if this is the first time we're seeing a read for this sample

View File

@ -29,6 +29,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
protected int POOL_SIZE;
protected double LOD_THRESHOLD;
protected int maxDeletionsInPileup;
protected String assumedSingleSample;
protected boolean VERBOSE;
/**
@ -57,6 +58,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
POOL_SIZE = UAC.POOLSIZE;
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
maxDeletionsInPileup = UAC.MAX_DELETIONS;
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
VERBOSE = UAC.VERBOSE;
}
@ -76,10 +78,15 @@ public abstract class GenotypeCalculationModel implements Cloneable {
gcm.POOLED_INPUT = POOLED_INPUT;
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
gcm.assumedSingleSample = assumedSingleSample;
gcm.VERBOSE = VERBOSE;
return gcm;
}
/**
 * Sets the single sample to assume for reads that lack a read group.
 *
 * @param sample the sample name to associate with such reads
 */
public void setAssumedSingleSample(String sample) {
    assumedSingleSample = sample;
}
/**
* Must be overridden by concrete subclasses
* @param tracker rod data

View File

@ -55,6 +55,11 @@ public class UnifiedArgumentCollection {
public boolean VERBOSE = false;
// control the error modes
@Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
public String ASSUME_SINGLE_SAMPLE = null;
// control the various parameters to be used
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
public double LOD_THRESHOLD = Double.MIN_VALUE;

View File

@ -30,7 +30,6 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.MissingReadGroupFilter;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
@ -52,7 +51,7 @@ import java.util.List;
import java.util.ArrayList;
@ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class})
@ReadFilters({ZeroMappingQualityReadFilter.class})
public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, GenotypeMetaData>, Integer> {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
@ -83,6 +82,15 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
/** Enable deletions in the pileup **/
public boolean includeReadsWithDeletionAtLoci() { return true; }
/**
 * Sets the single sample to assume when read groups are missing.
 * To be used with walkers that call the UnifiedGenotyper's map function.
 *
 * @param sample the sample name to associate with reads lacking a read group
 **/
public void setAssumedSingleSample(String sample) {
    gcm.setAssumedSingleSample(sample);
}
/**
* Initialize the samples, output, and genotype calculation model
*
@ -95,9 +103,14 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
// get all of the unique sample names
samples = new HashSet<String>();
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
for ( SAMReadGroupRecord readGroup : readGroups )
samples.add(readGroup.getSample());
// if we're supposed to assume a single sample
if ( UAC.ASSUME_SINGLE_SAMPLE != null ) {
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
} else {
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
for ( SAMReadGroupRecord readGroup : readGroups )
samples.add(readGroup.getSample());
}
// print them out for debugging (need separate loop to ensure uniqueness)
for ( String sample : samples )