Joint Estimation model now emits calls in all formats.
The whole GenotypeCall framework needs to be changed, but this will work for the time being. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1902 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a6dc8cd44e
commit
6c338eccb8
|
|
@ -30,10 +30,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
protected Logger logger;
|
protected Logger logger;
|
||||||
protected double heterozygosity;
|
protected double heterozygosity;
|
||||||
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
|
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
|
||||||
|
protected boolean ALL_BASE_MODE;
|
||||||
protected boolean GENOTYPE_MODE;
|
protected boolean GENOTYPE_MODE;
|
||||||
protected boolean POOLED_INPUT;
|
protected boolean POOLED_INPUT;
|
||||||
protected int POOL_SIZE;
|
protected int POOL_SIZE;
|
||||||
protected double LOD_THRESHOLD;
|
protected double LOD_THRESHOLD;
|
||||||
|
protected double CONFIDENCE_THRESHOLD;
|
||||||
protected double MINIMUM_ALLELE_FREQUENCY;
|
protected double MINIMUM_ALLELE_FREQUENCY;
|
||||||
protected int maxDeletionsInPileup;
|
protected int maxDeletionsInPileup;
|
||||||
protected String assumedSingleSample;
|
protected String assumedSingleSample;
|
||||||
|
|
@ -60,10 +62,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
baseModel = UAC.baseModel;
|
baseModel = UAC.baseModel;
|
||||||
heterozygosity = UAC.heterozygosity;
|
heterozygosity = UAC.heterozygosity;
|
||||||
defaultPlatform = UAC.defaultPlatform;
|
defaultPlatform = UAC.defaultPlatform;
|
||||||
|
ALL_BASE_MODE = UAC.ALL_BASES;
|
||||||
GENOTYPE_MODE = UAC.GENOTYPE;
|
GENOTYPE_MODE = UAC.GENOTYPE;
|
||||||
POOLED_INPUT = UAC.POOLED;
|
POOLED_INPUT = UAC.POOLED;
|
||||||
POOL_SIZE = UAC.POOLSIZE;
|
POOL_SIZE = UAC.POOLSIZE;
|
||||||
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
||||||
|
CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
|
||||||
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
|
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
|
||||||
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
||||||
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
|
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
||||||
private double[] PofFs = new double[BaseUtils.BASES.length];
|
private double[] PofFs = new double[BaseUtils.BASES.length];
|
||||||
|
|
||||||
// the minimum and actual number of points in our allele frequency estimation
|
// the minimum and actual number of points in our allele frequency estimation
|
||||||
private static final int MIN_ESTIMATION_POINTS = 100; //1000;
|
private static final int MIN_ESTIMATION_POINTS = 100;
|
||||||
private int frequencyEstimationPoints;
|
private int frequencyEstimationPoints;
|
||||||
|
|
||||||
// the GenotypeLikelihoods map
|
// the GenotypeLikelihoods map
|
||||||
|
|
@ -49,7 +49,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
||||||
// run joint estimation for the full GL contexts
|
// run joint estimation for the full GL contexts
|
||||||
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
|
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
|
||||||
calculateAlleleFrequencyPosteriors(ref, context.getLocation());
|
calculateAlleleFrequencyPosteriors(ref, context.getLocation());
|
||||||
|
return createCalls(ref, contexts);
|
||||||
|
|
||||||
//double lod = overall.getPofD() - overall.getPofNull();
|
//double lod = overall.getPofD() - overall.getPofNull();
|
||||||
//logger.debug("lod=" + lod);
|
//logger.debug("lod=" + lod);
|
||||||
|
|
@ -67,7 +67,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
||||||
// generate the calls
|
// generate the calls
|
||||||
//GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF());
|
//GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF());
|
||||||
//return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata);
|
//return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata);
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initializeAlleleFrequencies(int numSamples) {
|
private void initializeAlleleFrequencies(int numSamples) {
|
||||||
|
|
@ -232,7 +231,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
||||||
private void normalizeFromLog10(double[] array) {
|
private void normalizeFromLog10(double[] array) {
|
||||||
// for precision purposes, we need to add (or really subtract, since they're
|
// for precision purposes, we need to add (or really subtract, since they're
|
||||||
// all negative) the largest value; also, we need to convert to normal-space.
|
// all negative) the largest value; also, we need to convert to normal-space.
|
||||||
double maxValue = findMaxEntry(array);
|
double maxValue = findMaxEntry(array).first;
|
||||||
for (int i = 0; i < array.length; i++)
|
for (int i = 0; i < array.length; i++)
|
||||||
array[i] = Math.pow(10, array[i] - maxValue);
|
array[i] = Math.pow(10, array[i] - maxValue);
|
||||||
|
|
||||||
|
|
@ -244,13 +243,17 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
||||||
array[i] /= sum;
|
array[i] /= sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static double findMaxEntry(double[] array) {
|
// returns the maximum value in the array and its index
|
||||||
|
private static Pair<Double, Integer> findMaxEntry(double[] array) {
|
||||||
|
int index = 0;
|
||||||
double max = array[0];
|
double max = array[0];
|
||||||
for (int index = 1; index < array.length; index++) {
|
for (int i = 1; i < array.length; i++) {
|
||||||
if ( array[index] > max )
|
if ( array[i] > max ) {
|
||||||
max = array[index];
|
max = array[i];
|
||||||
|
index = i;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return max;
|
return new Pair<Double, Integer>(max, index);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
|
private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
|
||||||
|
|
@ -282,9 +285,55 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
||||||
char base = Character.toLowerCase(altAllele);
|
char base = Character.toLowerCase(altAllele);
|
||||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||||
verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex]));
|
verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex]));
|
||||||
|
verboseWriter.println("Qscore_" + base + " = " + (-10.0 * Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
|
||||||
verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
|
verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
verboseWriter.println();
|
verboseWriter.println();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Pair<List<GenotypeCall>, GenotypeMetaData> createCalls(char ref, HashMap<String, AlignmentContextBySample> contexts) {
|
||||||
|
// first, find the alt allele with maximum confidence
|
||||||
|
int indexOfMax = 0;
|
||||||
|
char baseOfMax = ref;
|
||||||
|
double maxConfidence = Double.MIN_VALUE;
|
||||||
|
for ( char altAllele : BaseUtils.BASES ) {
|
||||||
|
if ( altAllele != ref ) {
|
||||||
|
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||||
|
if ( PofFs[baseIndex] > maxConfidence ) {
|
||||||
|
indexOfMax = baseIndex;
|
||||||
|
baseOfMax = altAllele;
|
||||||
|
maxConfidence = PofFs[baseIndex];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
|
||||||
|
double bestAFguess = (double)findMaxEntry(alleleFrequencyPosteriors[indexOfMax]).second / (double)(frequencyEstimationPoints-1);
|
||||||
|
|
||||||
|
ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
|
||||||
|
// TODO -- generate strand score
|
||||||
|
double strandScore = 0.0;
|
||||||
|
GenotypeMetaData metadata = new GenotypeMetaData(phredScaledConfidence, strandScore, bestAFguess);
|
||||||
|
|
||||||
|
// generate calls only if we pass the threshold or we want ref calls
|
||||||
|
if ( phredScaledConfidence >= CONFIDENCE_THRESHOLD || ALL_BASE_MODE ) {
|
||||||
|
|
||||||
|
for ( String sample : GLs.keySet() ) {
|
||||||
|
// get the pileup
|
||||||
|
AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
|
||||||
|
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||||
|
pileup.setIncludeDeletionsInPileupString(true);
|
||||||
|
// TODO -- fix GenotypeCall code so that each call doesn't need its own pileup
|
||||||
|
|
||||||
|
// create the call
|
||||||
|
GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup);
|
||||||
|
calls.add(call);
|
||||||
|
|
||||||
|
// TODO -- fix GenotypeCall code so that UG tells it which genotypes to use
|
||||||
|
// TODO -- all of the intelligence for making calls should be in UG
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Pair<List<GenotypeCall>, GenotypeMetaData>(calls, metadata);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -51,22 +51,28 @@ public class UnifiedArgumentCollection {
|
||||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
||||||
public boolean GENOTYPE = false;
|
public boolean GENOTYPE = false;
|
||||||
|
|
||||||
@Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output nonconfident variant or confident ref calls too?", required = false)
|
||||||
|
public boolean ALL_BASES = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
||||||
public String VERBOSE = null;
|
public String VERBOSE = null;
|
||||||
|
|
||||||
|
|
||||||
// control the error modes
|
// control the error modes
|
||||||
@Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
||||||
public String ASSUME_SINGLE_SAMPLE = null;
|
public String ASSUME_SINGLE_SAMPLE = null;
|
||||||
|
|
||||||
|
|
||||||
// control the various parameters to be used
|
|
||||||
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
|
|
||||||
public double LOD_THRESHOLD = Double.MIN_VALUE;
|
|
||||||
|
|
||||||
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
||||||
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
|
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
|
||||||
|
|
||||||
|
|
||||||
|
// control the various parameters to be used
|
||||||
|
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold by which variants should be filtered (for EM_POINT_ESTIMATE model)", required = false)
|
||||||
|
public double LOD_THRESHOLD = Double.MIN_VALUE;
|
||||||
|
|
||||||
|
@Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered (for JOINT_ESTIMATE model)", required = false)
|
||||||
|
public double CONFIDENCE_THRESHOLD = Double.MIN_VALUE;
|
||||||
|
|
||||||
@Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
|
@Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
|
||||||
public Integer MAX_DELETIONS = 1;
|
public Integer MAX_DELETIONS = 1;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue