Don't filter out reads without proper read groups. Instead, allow the user (or another walker calling UG) to specify an assumed sample to use (but then we assume single-sample mode).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1883 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a8a2c1a2a1
commit
b8ab77c91c
|
|
@ -1,8 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -10,33 +10,69 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
|
|||
|
||||
protected AllMAFsGenotypeCalculationModel() {}
|
||||
|
||||
private double[] alleleFrequencies;
|
||||
// because the null allele frequencies are constant for a given N,
|
||||
// we cache the results to avoid having to recompute everything
|
||||
private HashMap<Integer, double[]> nullAlleleFrequencyCache = new HashMap<Integer, double[]>();
|
||||
|
||||
// the allele frequencies
|
||||
private double[][] alleleFrequencies = new double[3][];
|
||||
private double[][] oldAlleleFrequencies;
|
||||
|
||||
// keep track of whether or not a given MAF is stable
|
||||
private boolean[] frequencyStabilityArray = new boolean[3];
|
||||
|
||||
// the minimum and actual number of points in our allele frequency estimation
|
||||
private static final int MIN_ESTIMATION_POINTS = 1000;
|
||||
private int estimationPoints;
|
||||
|
||||
// the GenotypeLikelihoods map
|
||||
private HashMap<String, AlleleSpecificGenotypeLikelihoods> GLs = new HashMap<String, AlleleSpecificGenotypeLikelihoods>();
|
||||
|
||||
|
||||
protected void initializeAlleleFrequencies(int numSamples, char ref) {
|
||||
// we have 2N possible allele frequencies in pileup
|
||||
int possibleMAFs = 2 * numSamples;
|
||||
alleleFrequencies = new double[possibleMAFs];
|
||||
// first, initialize the stability array to "unstable"
|
||||
for (int i = 0; i < 3; i++)
|
||||
frequencyStabilityArray[i] = false;
|
||||
|
||||
// calculate sum(1/i) for i from 1 to 2N
|
||||
double denominator = 0.0;
|
||||
for (int i = 1; i <= possibleMAFs; i++)
|
||||
denominator += 1.0 / (double)i;
|
||||
// calculate the number of estimation points to use:
|
||||
// it's either MIN_ESTIMATION_POINTS or 2N if that's larger
|
||||
// (add 1 for allele frequency of zero)
|
||||
estimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1;
|
||||
|
||||
// set up delta
|
||||
double delta = 1.0 / denominator;
|
||||
for (int alt = 0; alt < 3; alt++)
|
||||
alleleFrequencies[alt] = getNullAlleleFrequencies(estimationPoints);
|
||||
|
||||
// calculate the null allele frequencies
|
||||
for (int i = 1; i <= possibleMAFs; i++)
|
||||
alleleFrequencies[i-1] = Math.log10(delta / (double)i);
|
||||
for (int i = 1; i < estimationPoints; i++)
|
||||
logger.debug("Initial allele frequency for MAF=" + i + ": " + alleleFrequencies[0][i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < possibleMAFs; i++)
|
||||
logger.debug("Initial allele frequency for MAF=" + (i+1) + ": " + alleleFrequencies[i]);
|
||||
private double[] getNullAlleleFrequencies(int N) {
|
||||
double[] AFs = nullAlleleFrequencyCache.get(N);
|
||||
|
||||
// if it hasn't been calculated yet, do so now
|
||||
if ( AFs == null ) {
|
||||
|
||||
// calculate sum(1/i)
|
||||
double denominator = 0.0;
|
||||
for (int i = 1; i < N; i++)
|
||||
denominator += 1.0 / (double)i;
|
||||
|
||||
// set up delta
|
||||
double delta = 1.0 / denominator;
|
||||
|
||||
// calculate the null allele frequencies
|
||||
AFs = new double[N];
|
||||
for (int i = 1; i < N; i++)
|
||||
AFs[i] = Math.log10(delta / (double)i);
|
||||
|
||||
nullAlleleFrequencyCache.put(N, AFs);
|
||||
}
|
||||
|
||||
return AFs.clone();
|
||||
}
|
||||
|
||||
protected void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
|
||||
HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
|
||||
GLs.clear();
|
||||
|
||||
for ( String sample : contexts.keySet() ) {
|
||||
AlignmentContextBySample context = contexts.get(sample);
|
||||
|
|
@ -47,7 +83,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
|
|||
GL.setVerbose(VERBOSE);
|
||||
GL.add(pileup, true);
|
||||
|
||||
GLs.put(sample, GL);
|
||||
GLs.put(sample, new AlleleSpecificGenotypeLikelihoods(ref, GL));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -60,10 +96,67 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
|
|||
}
|
||||
|
||||
protected boolean isStable() {
|
||||
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
|
||||
// We compute this separately for all of the alternate alleles
|
||||
for (int i = 0; i < 3; i++) {
|
||||
// if we've already determined that a MAF is stable, don't recalculate
|
||||
if ( frequencyStabilityArray[i] )
|
||||
continue;
|
||||
|
||||
// determine change
|
||||
double AF_delta = 0.0;
|
||||
for (int j = 1; j < estimationPoints; j++)
|
||||
AF_delta += Math.abs(oldAlleleFrequencies[i][j] - alleleFrequencies[i][j]);
|
||||
|
||||
// if it's not stable, we're done
|
||||
if (AF_delta > EM_STABILITY_METRIC)
|
||||
return false;
|
||||
|
||||
// it's stable, so record that fact in the stability array
|
||||
frequencyStabilityArray[i] = true;
|
||||
}
|
||||
|
||||
// if we got here, then we're stable
|
||||
return true;
|
||||
}
|
||||
|
||||
protected EMOutput computePofF(char ref) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A class for the allele-specific genotype likelihoods
|
||||
*/
|
||||
protected class AlleleSpecificGenotypeLikelihoods {
|
||||
private HashMap<String, Pair<double[], double[]>> GLs = new HashMap<String, Pair<double[], double[]>>();
|
||||
|
||||
AlleleSpecificGenotypeLikelihoods(char ref, GenotypeLikelihoods GL) {
|
||||
double[] likelihoods = GL.getLikelihoods();
|
||||
double[] posteriors = GL.getPosteriors();
|
||||
|
||||
|
||||
|
||||
// get the ref likelihood/posterior
|
||||
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
|
||||
double refLikelihood = GL.getLikelihood(refGenotype);
|
||||
double refPosterior = GL.getPosterior(refGenotype);
|
||||
String refStr = String.valueOf(ref);
|
||||
|
||||
for ( char base : BaseUtils.BASES ) {
|
||||
if ( base == ref )
|
||||
continue;
|
||||
|
||||
// get hom var and het likelihoods
|
||||
double homLikelihood = GL.getLikelihood(DiploidGenotype.createHomGenotype(base));
|
||||
double hetLikelihood = GL.getLikelihood(DiploidGenotype.valueOf(refStr + String.valueOf(base)));
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
|
@ -130,7 +131,14 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
if ( POOLED_INPUT ) {
|
||||
sample = "POOL";
|
||||
} else {
|
||||
sample = read.getReadGroup().getSample();
|
||||
SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
if ( readGroup == null ) {
|
||||
if ( assumedSingleSample == null )
|
||||
throw new StingException("Missing read group for read " + read.getReadName());
|
||||
sample = assumedSingleSample;
|
||||
} else {
|
||||
sample = readGroup.getSample();
|
||||
}
|
||||
}
|
||||
|
||||
// create a new context object if this is the first time we're seeing a read for this sample
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
protected int POOL_SIZE;
|
||||
protected double LOD_THRESHOLD;
|
||||
protected int maxDeletionsInPileup;
|
||||
protected String assumedSingleSample;
|
||||
protected boolean VERBOSE;
|
||||
|
||||
/**
|
||||
|
|
@ -57,6 +58,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
POOL_SIZE = UAC.POOLSIZE;
|
||||
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
||||
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
||||
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
|
||||
VERBOSE = UAC.VERBOSE;
|
||||
}
|
||||
|
||||
|
|
@ -76,10 +78,15 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
gcm.POOLED_INPUT = POOLED_INPUT;
|
||||
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
|
||||
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
|
||||
gcm.assumedSingleSample = assumedSingleSample;
|
||||
gcm.VERBOSE = VERBOSE;
|
||||
return gcm;
|
||||
}
|
||||
|
||||
public void setAssumedSingleSample(String sample) {
|
||||
assumedSingleSample = sample;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param tracker rod data
|
||||
|
|
|
|||
|
|
@ -55,6 +55,11 @@ public class UnifiedArgumentCollection {
|
|||
public boolean VERBOSE = false;
|
||||
|
||||
|
||||
// control the error modes
|
||||
@Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
||||
public String ASSUME_SINGLE_SAMPLE = null;
|
||||
|
||||
|
||||
// control the various parameters to be used
|
||||
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
|
||||
public double LOD_THRESHOLD = Double.MIN_VALUE;
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.MissingReadGroupFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
|
|
@ -52,7 +51,7 @@ import java.util.List;
|
|||
import java.util.ArrayList;
|
||||
|
||||
|
||||
@ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class})
|
||||
@ReadFilters({ZeroMappingQualityReadFilter.class})
|
||||
public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, GenotypeMetaData>, Integer> {
|
||||
|
||||
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
|
@ -83,6 +82,15 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
|
|||
/** Enable deletions in the pileup **/
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
/**
|
||||
* Sets the single sample to assume when read groups are missing.
|
||||
* To be used with walkers that call the UnifiedGenotyper's map function
|
||||
*
|
||||
**/
|
||||
public void setAssumedSingleSample(String sample) {
|
||||
gcm.setAssumedSingleSample(sample);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the samples, output, and genotype calculation model
|
||||
*
|
||||
|
|
@ -95,9 +103,14 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
|
|||
|
||||
// get all of the unique sample names
|
||||
samples = new HashSet<String>();
|
||||
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
|
||||
for ( SAMReadGroupRecord readGroup : readGroups )
|
||||
samples.add(readGroup.getSample());
|
||||
// if we're supposed to assume a single sample
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null ) {
|
||||
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
} else {
|
||||
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
|
||||
for ( SAMReadGroupRecord readGroup : readGroups )
|
||||
samples.add(readGroup.getSample());
|
||||
}
|
||||
|
||||
// print them out for debugging (need separate loop to ensure uniqueness)
|
||||
for ( String sample : samples )
|
||||
|
|
|
|||
Loading…
Reference in New Issue