Don't filter out reads without proper read groups. Instead, allow the user (or another walker calling the UnifiedGenotyper) to specify an assumed sample to use for such reads (in which case we assume single-sample mode).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1883 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a8a2c1a2a1
commit
b8ab77c91c
|
|
@ -1,8 +1,8 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
import org.broadinstitute.sting.utils.genotype.*;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -10,33 +10,69 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
|
||||||
|
|
||||||
protected AllMAFsGenotypeCalculationModel() {}
|
protected AllMAFsGenotypeCalculationModel() {}
|
||||||
|
|
||||||
private double[] alleleFrequencies;
|
// because the null allele frequencies are constant for a given N,
|
||||||
|
// we cache the results to avoid having to recompute everything
|
||||||
|
private HashMap<Integer, double[]> nullAlleleFrequencyCache = new HashMap<Integer, double[]>();
|
||||||
|
|
||||||
|
// the allele frequencies
|
||||||
|
private double[][] alleleFrequencies = new double[3][];
|
||||||
|
private double[][] oldAlleleFrequencies;
|
||||||
|
|
||||||
|
// keep track of whether or not a given MAF is stable
|
||||||
|
private boolean[] frequencyStabilityArray = new boolean[3];
|
||||||
|
|
||||||
|
// the minimum and actual number of points in our allele frequency estimation
|
||||||
|
private static final int MIN_ESTIMATION_POINTS = 1000;
|
||||||
|
private int estimationPoints;
|
||||||
|
|
||||||
|
// the GenotypeLikelihoods map
|
||||||
|
private HashMap<String, AlleleSpecificGenotypeLikelihoods> GLs = new HashMap<String, AlleleSpecificGenotypeLikelihoods>();
|
||||||
|
|
||||||
|
|
||||||
/**
 * Resets the allele frequency state before estimation starts.
 * (Side-by-side diff: in each row the pre-change line comes first, the post-change line second.)
 *
 * Post-change version: clears all three entries of frequencyStabilityArray, sets
 * estimationPoints to max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1, gives each of the
 * three alleleFrequencies rows its own copy of the null allele frequencies, and
 * debug-logs row 0 for indices 1 .. estimationPoints-1.
 * Pre-change version: filled a single array of length 2 * numSamples inline.
 *
 * @param numSamples number of samples; only used as 2 * numSamples
 * @param ref        reference base; not read by either version of the body shown here
 */
protected void initializeAlleleFrequencies(int numSamples, char ref) {
|
protected void initializeAlleleFrequencies(int numSamples, char ref) {
|
||||||
// we have 2N possible allele frequencies in pileup
|
// first, initialize the stability array to "unstable"
|
||||||
int possibleMAFs = 2 * numSamples;
|
for (int i = 0; i < 3; i++)
|
||||||
alleleFrequencies = new double[possibleMAFs];
|
frequencyStabilityArray[i] = false;
|
||||||
|
|
||||||
// calculate sum(1/i) for i from 1 to 2N
|
// calculate the number of estimation points to use:
|
||||||
|
// it's either MIN_ESTIMATION_POINTS or 2N if that's larger
|
||||||
|
// (add 1 for allele frequency of zero)
|
||||||
|
estimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1;
|
||||||
|
|
||||||
|
for (int alt = 0; alt < 3; alt++)
|
||||||
|
alleleFrequencies[alt] = getNullAlleleFrequencies(estimationPoints);
|
||||||
|
|
||||||
|
for (int i = 1; i < estimationPoints; i++)
|
||||||
|
logger.debug("Initial allele frequency for MAF=" + i + ": " + alleleFrequencies[0][i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the log10 null allele frequencies for an array of length N, computing them
 * once per distinct N and keeping the result in nullAlleleFrequencyCache.
 *
 * For i in 1 .. N-1 the entry is log10((1/i) / sum), where sum is the total of 1/k for
 * k in 1 .. N-1. Index 0 is never written, so it keeps Java's default value of 0.0.
 *
 * @param N length of the returned array
 * @return a clone of the cached array, so the caller can modify it without altering the cache
 */
private double[] getNullAlleleFrequencies(int N) {
|
||||||
|
double[] AFs = nullAlleleFrequencyCache.get(N);
|
||||||
|
|
||||||
|
// if it hasn't been calculated yet, do so now
|
||||||
|
if ( AFs == null ) {
|
||||||
|
|
||||||
|
// calculate sum(1/i)
|
||||||
double denominator = 0.0;
|
double denominator = 0.0;
|
||||||
for (int i = 1; i <= possibleMAFs; i++)
|
for (int i = 1; i < N; i++)
|
||||||
denominator += 1.0 / (double)i;
|
denominator += 1.0 / (double)i;
|
||||||
|
|
||||||
// set up delta
|
// set up delta
|
||||||
double delta = 1.0 / denominator;
|
double delta = 1.0 / denominator;
|
||||||
|
|
||||||
// calculate the null allele frequencies
|
// calculate the null allele frequencies
|
||||||
for (int i = 1; i <= possibleMAFs; i++)
|
AFs = new double[N];
|
||||||
alleleFrequencies[i-1] = Math.log10(delta / (double)i);
|
for (int i = 1; i < N; i++)
|
||||||
|
AFs[i] = Math.log10(delta / (double)i);
|
||||||
|
|
||||||
for (int i = 0; i < possibleMAFs; i++)
|
nullAlleleFrequencyCache.put(N, AFs);
|
||||||
logger.debug("Initial allele frequency for MAF=" + (i+1) + ": " + alleleFrequencies[i]);
|
}
|
||||||
|
|
||||||
|
// hand out a copy so that the cached array itself is never modified by callers
return AFs.clone();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
|
protected void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
|
||||||
HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
|
GLs.clear();
|
||||||
|
|
||||||
for ( String sample : contexts.keySet() ) {
|
for ( String sample : contexts.keySet() ) {
|
||||||
AlignmentContextBySample context = contexts.get(sample);
|
AlignmentContextBySample context = contexts.get(sample);
|
||||||
|
|
@ -47,7 +83,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
|
||||||
GL.setVerbose(VERBOSE);
|
GL.setVerbose(VERBOSE);
|
||||||
GL.add(pileup, true);
|
GL.add(pileup, true);
|
||||||
|
|
||||||
GLs.put(sample, GL);
|
GLs.put(sample, new AlleleSpecificGenotypeLikelihoods(ref, GL));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -60,10 +96,67 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Reports whether the allele frequency estimates have stopped changing.
 * (Side-by-side diff: in each row the pre-change line comes first, the post-change line second.)
 *
 * Pre-change version: always returned true.
 * Post-change version: for each of the three alleleFrequencies rows that is not yet
 * flagged in frequencyStabilityArray, sums the absolute difference between
 * oldAlleleFrequencies and alleleFrequencies over indices 1 .. estimationPoints-1
 * (index 0 is excluded). The first row whose sum exceeds EM_STABILITY_METRIC makes the
 * method return false immediately, so later rows are not examined on that call;
 * otherwise the row is flagged stable and is skipped on subsequent calls.
 *
 * NOTE(review): oldAlleleFrequencies is declared but not assigned anywhere in the code
 * visible in this diff -- confirm it is populated before this method runs.
 *
 * @return true when every row is flagged stable, false as soon as one row is not
 */
protected boolean isStable() {
|
protected boolean isStable() {
|
||||||
|
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
|
||||||
|
// We compute this separately for all of the alternate alleles
|
||||||
|
for (int i = 0; i < 3; i++) {
|
||||||
|
// if we've already determined that a MAF is stable, don't recalculate
|
||||||
|
if ( frequencyStabilityArray[i] )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// determine change
|
||||||
|
double AF_delta = 0.0;
|
||||||
|
for (int j = 1; j < estimationPoints; j++)
|
||||||
|
AF_delta += Math.abs(oldAlleleFrequencies[i][j] - alleleFrequencies[i][j]);
|
||||||
|
|
||||||
|
// if it's not stable, we're done
|
||||||
|
if (AF_delta > EM_STABILITY_METRIC)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// it's stable, so record that fact in the stability array
|
||||||
|
frequencyStabilityArray[i] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we got here, then we're stable
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Placeholder implementation: returns null without reading ref.
 * The method is identical on the pre-change and post-change sides of this diff.
 *
 * @param ref reference base (unused here)
 * @return always null
 */
protected EMOutput computePofF(char ref) {
|
protected EMOutput computePofF(char ref) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A class for the allele-specific genotype likelihoods
* (added in this commit; it appears only on the post-change side of the diff).
* It wraps a GenotypeLikelihoods object and declares a map from String keys to pairs of
* double arrays. As shown here the constructor only reads values from GL into local
* variables and never stores anything in that map.
* NOTE(review): this looks like work in progress -- confirm the intended map contents.
|
||||||
|
*/
|
||||||
|
protected class AlleleSpecificGenotypeLikelihoods {
|
||||||
|
private HashMap<String, Pair<double[], double[]>> GLs = new HashMap<String, Pair<double[], double[]>>();
|
||||||
|
|
||||||
|
// Reads the full likelihood/posterior arrays, the hom-ref values, and for every non-ref
// base in BaseUtils.BASES the hom-var and ref/alt het likelihoods. None of these locals
// is used after being assigned in the code shown.
AlleleSpecificGenotypeLikelihoods(char ref, GenotypeLikelihoods GL) {
|
||||||
|
double[] likelihoods = GL.getLikelihoods();
|
||||||
|
double[] posteriors = GL.getPosteriors();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// get the ref likelihood/posterior
|
||||||
|
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
|
||||||
|
double refLikelihood = GL.getLikelihood(refGenotype);
|
||||||
|
double refPosterior = GL.getPosterior(refGenotype);
|
||||||
|
String refStr = String.valueOf(ref);
|
||||||
|
|
||||||
|
for ( char base : BaseUtils.BASES ) {
|
||||||
|
if ( base == ref )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// get hom var and het likelihoods
|
||||||
|
double homLikelihood = GL.getLikelihood(DiploidGenotype.createHomGenotype(base));
|
||||||
|
// NOTE(review): the het name is built as ref followed by alt. If DiploidGenotype is an
// enum that defines each het in only one base order, valueOf would throw
// IllegalArgumentException for the other order -- verify against DiploidGenotype.
double hetLikelihood = GL.getLikelihood(DiploidGenotype.valueOf(refStr + String.valueOf(base)));
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
|
@ -130,7 +131,14 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
||||||
if ( POOLED_INPUT ) {
|
if ( POOLED_INPUT ) {
|
||||||
sample = "POOL";
|
sample = "POOL";
|
||||||
} else {
|
} else {
|
||||||
sample = read.getReadGroup().getSample();
|
SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||||
|
if ( readGroup == null ) {
|
||||||
|
if ( assumedSingleSample == null )
|
||||||
|
throw new StingException("Missing read group for read " + read.getReadName());
|
||||||
|
sample = assumedSingleSample;
|
||||||
|
} else {
|
||||||
|
sample = readGroup.getSample();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// create a new context object if this is the first time we're seeing a read for this sample
|
// create a new context object if this is the first time we're seeing a read for this sample
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
protected int POOL_SIZE;
|
protected int POOL_SIZE;
|
||||||
protected double LOD_THRESHOLD;
|
protected double LOD_THRESHOLD;
|
||||||
protected int maxDeletionsInPileup;
|
protected int maxDeletionsInPileup;
|
||||||
|
protected String assumedSingleSample;
|
||||||
protected boolean VERBOSE;
|
protected boolean VERBOSE;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -57,6 +58,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
POOL_SIZE = UAC.POOLSIZE;
|
POOL_SIZE = UAC.POOLSIZE;
|
||||||
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
||||||
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
||||||
|
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
|
||||||
VERBOSE = UAC.VERBOSE;
|
VERBOSE = UAC.VERBOSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -76,10 +78,15 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
gcm.POOLED_INPUT = POOLED_INPUT;
|
gcm.POOLED_INPUT = POOLED_INPUT;
|
||||||
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
|
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
|
||||||
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
|
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
|
||||||
|
gcm.assumedSingleSample = assumedSingleSample;
|
||||||
gcm.VERBOSE = VERBOSE;
|
gcm.VERBOSE = VERBOSE;
|
||||||
return gcm;
|
return gcm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Overrides the assumed single sample name held by this model, replacing whatever was
 * copied from UAC.ASSUME_SINGLE_SAMPLE during initialization.
 * The EM model's read-group handling in this commit uses the value as the sample for
 * reads whose read group is null, and throws a StingException for such reads when the
 * value is null.
 *
 * @param sample sample name to assume; no validation is performed here
 */
public void setAssumedSingleSample(String sample) {
|
||||||
|
assumedSingleSample = sample;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Must be overridden by concrete subclasses
|
* Must be overridden by concrete subclasses
|
||||||
* @param tracker rod data
|
* @param tracker rod data
|
||||||
|
|
|
||||||
|
|
@ -55,6 +55,11 @@ public class UnifiedArgumentCollection {
|
||||||
public boolean VERBOSE = false;
|
public boolean VERBOSE = false;
|
||||||
|
|
||||||
|
|
||||||
|
// control the error modes
|
||||||
|
@Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
||||||
|
public String ASSUME_SINGLE_SAMPLE = null;
|
||||||
|
|
||||||
|
|
||||||
// control the various parameters to be used
|
// control the various parameters to be used
|
||||||
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
|
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
|
||||||
public double LOD_THRESHOLD = Double.MIN_VALUE;
|
public double LOD_THRESHOLD = Double.MIN_VALUE;
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,6 @@ import net.sf.samtools.SAMRecord;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.filters.MissingReadGroupFilter;
|
|
||||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||||
|
|
@ -52,7 +51,7 @@ import java.util.List;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
|
||||||
@ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class})
|
@ReadFilters({ZeroMappingQualityReadFilter.class})
|
||||||
public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, GenotypeMetaData>, Integer> {
|
public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, GenotypeMetaData>, Integer> {
|
||||||
|
|
||||||
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||||
|
|
@ -83,6 +82,15 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
|
||||||
/** Enable deletions in the pileup **/
|
/** Enable deletions in the pileup **/
|
||||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the single sample to assume when read groups are missing.
|
||||||
|
* To be used with walkers that call the UnifiedGenotyper's map function
|
||||||
|
*
* Simply forwards the value to the genotype calculation model (gcm).
* NOTE(review): the assignment of gcm is not visible in this diff -- presumably it is
* created during initialization, so confirm this is only called afterwards.
* NOTE(review): the sample set built in the initialization hunk is filled from
* UAC.ASSUME_SINGLE_SAMPLE and is not updated by this call -- confirm that is intended.
*
* @param sample sample name to associate with reads that have no read group
|
||||||
|
**/
|
||||||
|
public void setAssumedSingleSample(String sample) {
|
||||||
|
gcm.setAssumedSingleSample(sample);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the samples, output, and genotype calculation model
|
* Initialize the samples, output, and genotype calculation model
|
||||||
*
|
*
|
||||||
|
|
@ -95,9 +103,14 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
|
||||||
|
|
||||||
// get all of the unique sample names
|
// get all of the unique sample names
|
||||||
samples = new HashSet<String>();
|
samples = new HashSet<String>();
|
||||||
|
// if we're supposed to assume a single sample
|
||||||
|
if ( UAC.ASSUME_SINGLE_SAMPLE != null ) {
|
||||||
|
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||||
|
} else {
|
||||||
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
|
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
|
||||||
for ( SAMReadGroupRecord readGroup : readGroups )
|
for ( SAMReadGroupRecord readGroup : readGroups )
|
||||||
samples.add(readGroup.getSample());
|
samples.add(readGroup.getSample());
|
||||||
|
}
|
||||||
|
|
||||||
// print them out for debugging (need separate loop to ensure uniqueness)
|
// print them out for debugging (need separate loop to ensure uniqueness)
|
||||||
for ( String sample : samples )
|
for ( String sample : samples )
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue