Joint Estimation model now emits calls in all formats.
The whole GenotypeCall framework needs to be changed, but this will work for the time being. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1902 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a6dc8cd44e
commit
6c338eccb8
|
|
@ -30,10 +30,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
protected Logger logger;
|
||||
protected double heterozygosity;
|
||||
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
|
||||
protected boolean ALL_BASE_MODE;
|
||||
protected boolean GENOTYPE_MODE;
|
||||
protected boolean POOLED_INPUT;
|
||||
protected int POOL_SIZE;
|
||||
protected double LOD_THRESHOLD;
|
||||
protected double CONFIDENCE_THRESHOLD;
|
||||
protected double MINIMUM_ALLELE_FREQUENCY;
|
||||
protected int maxDeletionsInPileup;
|
||||
protected String assumedSingleSample;
|
||||
|
|
@ -60,10 +62,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
baseModel = UAC.baseModel;
|
||||
heterozygosity = UAC.heterozygosity;
|
||||
defaultPlatform = UAC.defaultPlatform;
|
||||
ALL_BASE_MODE = UAC.ALL_BASES;
|
||||
GENOTYPE_MODE = UAC.GENOTYPE;
|
||||
POOLED_INPUT = UAC.POOLED;
|
||||
POOL_SIZE = UAC.POOLSIZE;
|
||||
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
||||
CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
|
||||
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
|
||||
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
||||
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
private double[] PofFs = new double[BaseUtils.BASES.length];
|
||||
|
||||
// the minimum and actual number of points in our allele frequency estimation
|
||||
private static final int MIN_ESTIMATION_POINTS = 100; //1000;
|
||||
private static final int MIN_ESTIMATION_POINTS = 100;
|
||||
private int frequencyEstimationPoints;
|
||||
|
||||
// the GenotypeLikelihoods map
|
||||
|
|
@ -49,7 +49,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
// run joint estimation for the full GL contexts
|
||||
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
|
||||
calculateAlleleFrequencyPosteriors(ref, context.getLocation());
|
||||
|
||||
return createCalls(ref, contexts);
|
||||
|
||||
//double lod = overall.getPofD() - overall.getPofNull();
|
||||
//logger.debug("lod=" + lod);
|
||||
|
|
@ -67,7 +67,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
// generate the calls
|
||||
//GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF());
|
||||
//return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata);
|
||||
return null;
|
||||
}
|
||||
|
||||
private void initializeAlleleFrequencies(int numSamples) {
|
||||
|
|
@ -232,7 +231,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
private void normalizeFromLog10(double[] array) {
|
||||
// for precision purposes, we need to add (or really subtract, since they're
|
||||
// all negative) the largest value; also, we need to convert to normal-space.
|
||||
double maxValue = findMaxEntry(array);
|
||||
double maxValue = findMaxEntry(array).first;
|
||||
for (int i = 0; i < array.length; i++)
|
||||
array[i] = Math.pow(10, array[i] - maxValue);
|
||||
|
||||
|
|
@ -244,13 +243,17 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
array[i] /= sum;
|
||||
}
|
||||
|
||||
private static double findMaxEntry(double[] array) {
|
||||
// returns the maximum value in the array and its index
|
||||
private static Pair<Double, Integer> findMaxEntry(double[] array) {
|
||||
int index = 0;
|
||||
double max = array[0];
|
||||
for (int index = 1; index < array.length; index++) {
|
||||
if ( array[index] > max )
|
||||
max = array[index];
|
||||
for (int i = 1; i < array.length; i++) {
|
||||
if ( array[i] > max ) {
|
||||
max = array[i];
|
||||
index = i;
|
||||
}
|
||||
}
|
||||
return max;
|
||||
return new Pair<Double, Integer>(max, index);
|
||||
}
|
||||
|
||||
private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
|
||||
|
|
@ -282,9 +285,55 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
char base = Character.toLowerCase(altAllele);
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||
verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex]));
|
||||
verboseWriter.println("Qscore_" + base + " = " + (-10.0 * Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
|
||||
verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
|
||||
}
|
||||
}
|
||||
verboseWriter.println();
|
||||
}
|
||||
|
||||
private Pair<List<GenotypeCall>, GenotypeMetaData> createCalls(char ref, HashMap<String, AlignmentContextBySample> contexts) {
|
||||
// first, find the alt allele with maximum confidence
|
||||
int indexOfMax = 0;
|
||||
char baseOfMax = ref;
|
||||
double maxConfidence = Double.MIN_VALUE;
|
||||
for ( char altAllele : BaseUtils.BASES ) {
|
||||
if ( altAllele != ref ) {
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||
if ( PofFs[baseIndex] > maxConfidence ) {
|
||||
indexOfMax = baseIndex;
|
||||
baseOfMax = altAllele;
|
||||
maxConfidence = PofFs[baseIndex];
|
||||
}
|
||||
}
|
||||
}
|
||||
double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
|
||||
double bestAFguess = (double)findMaxEntry(alleleFrequencyPosteriors[indexOfMax]).second / (double)(frequencyEstimationPoints-1);
|
||||
|
||||
ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
|
||||
// TODO -- generate strand score
|
||||
double strandScore = 0.0;
|
||||
GenotypeMetaData metadata = new GenotypeMetaData(phredScaledConfidence, strandScore, bestAFguess);
|
||||
|
||||
// generate calls only if we pass the threshold or we want ref calls
|
||||
if ( phredScaledConfidence >= CONFIDENCE_THRESHOLD || ALL_BASE_MODE ) {
|
||||
|
||||
for ( String sample : GLs.keySet() ) {
|
||||
// get the pileup
|
||||
AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
pileup.setIncludeDeletionsInPileupString(true);
|
||||
// TODO -- fix GenotypeCall code so that each call doesn't need its own pileup
|
||||
|
||||
// create the call
|
||||
GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup);
|
||||
calls.add(call);
|
||||
|
||||
// TODO -- fix GenotypeCall code so that UG tells it which genotypes to use
|
||||
// TODO -- all of the intelligence for making calls should be in UG
|
||||
}
|
||||
}
|
||||
|
||||
return new Pair<List<GenotypeCall>, GenotypeMetaData>(calls, metadata);
|
||||
}
|
||||
}
|
||||
|
|
@ -51,22 +51,28 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
||||
public boolean GENOTYPE = false;
|
||||
|
||||
@Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
||||
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output nonconfident variant or confident ref calls too?", required = false)
|
||||
public boolean ALL_BASES = false;
|
||||
|
||||
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
||||
public String VERBOSE = null;
|
||||
|
||||
|
||||
// control the error modes
|
||||
@Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
||||
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
||||
public String ASSUME_SINGLE_SAMPLE = null;
|
||||
|
||||
|
||||
// control the various parameters to be used
|
||||
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
|
||||
public double LOD_THRESHOLD = Double.MIN_VALUE;
|
||||
|
||||
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
|
||||
|
||||
|
||||
// control the various parameters to be used
|
||||
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold by which variants should be filtered (for EM_POINT_ESTIMATE model)", required = false)
|
||||
public double LOD_THRESHOLD = Double.MIN_VALUE;
|
||||
|
||||
@Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered (for JOINT_ESTIMATE model)", required = false)
|
||||
public double CONFIDENCE_THRESHOLD = Double.MIN_VALUE;
|
||||
|
||||
@Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
|
||||
public Integer MAX_DELETIONS = 1;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue