clean-up of the GATK paper genotyper, and better output formatting for the simple call format we emit.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2529 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7e3e714d3c
commit
576594eda2
|
|
@ -16,17 +16,19 @@ import java.io.PrintStream;
|
||||||
/**
|
/**
|
||||||
* A simple Bayesian genotyper, that outputs a text based call format. Intended to be used only as an
|
* A simple Bayesian genotyper, that outputs a text based call format. Intended to be used only as an
|
||||||
* example in the GATK publication.
|
* example in the GATK publication.
|
||||||
|
*
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* @help.summary A simple, naive Bayesian genotyper that is used as an example locus walker in the GATK paper. THIS IS NOT TO BE USED FOR ANY ANALYSIS
|
|
||||||
*/
|
*/
|
||||||
public class GATKPaperGenotyper extends LocusWalker<SimpleCall, Integer> implements TreeReducible<Integer> {
|
public class GATKPaperGenotyper extends LocusWalker<SimpleCall, Integer> implements TreeReducible<Integer> {
|
||||||
|
|
||||||
// the possible diploid genotype strings
|
// the possible diploid genotype strings
|
||||||
private static enum GENOTYPE { AA, AC, AG, AT, CC, CG, CT, GG, GT, TT }
|
private static enum GENOTYPE { AA, AC, AG, AT, CC, CG, CT, GG, GT, TT }
|
||||||
|
|
||||||
// where to write the genotyping data to
|
|
||||||
@Argument(fullName = "call_location", shortName = "cl", doc = "File to which calls should be written", required = true)
|
@Argument(fullName = "call_location", shortName = "cl", doc = "File to which calls should be written", required = true)
|
||||||
public PrintStream outputStream;
|
private PrintStream outputStream;
|
||||||
|
|
||||||
|
@Argument(fullName = "log_odds_score", shortName = "LOD", doc = "The LOD threshold for us to call confidently a genotype", required = false)
|
||||||
|
private double LODScore = 3.0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* our map function, which takes the reads spanning this locus, any associated reference ordered data,
|
* our map function, which takes the reads spanning this locus, any associated reference ordered data,
|
||||||
|
|
@ -44,30 +46,33 @@ public class GATKPaperGenotyper extends LocusWalker<SimpleCall, Integer> impleme
|
||||||
double likelihoods[] = DiploidGenotypePriors.getReferencePolarizedPriors(ref.getBase(),
|
double likelihoods[] = DiploidGenotypePriors.getReferencePolarizedPriors(ref.getBase(),
|
||||||
DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,
|
DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,
|
||||||
DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);
|
DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);
|
||||||
|
// get the bases and qualities from the pileup
|
||||||
byte bases[] = pileup.getBases();
|
byte bases[] = pileup.getBases();
|
||||||
byte quals[] = pileup.getQuals();
|
byte quals[] = pileup.getQuals();
|
||||||
|
|
||||||
|
// for each genotype, determine its likelihood value
|
||||||
for (GENOTYPE genotype : GENOTYPE.values())
|
for (GENOTYPE genotype : GENOTYPE.values())
|
||||||
for (int index = 0; index < bases.length; index++) {
|
for (int index = 0; index < bases.length; index++) {
|
||||||
if (quals[index] > 0) {
|
if (quals[index] > 0) {
|
||||||
|
// our epsilon is the de-Phred scored base quality
|
||||||
double epsilon = Math.pow(10, quals[index] / -10.0);
|
double epsilon = Math.pow(10, quals[index] / -10.0);
|
||||||
|
|
||||||
byte pileupBase = bases[index];
|
byte pileupBase = bases[index];
|
||||||
for (char genotypeBase : genotype.toString().toCharArray()) {
|
double p = 0;
|
||||||
double p = genotypeBase == pileupBase ? 1 - epsilon : epsilon / 3;
|
for (char r : genotype.toString().toCharArray())
|
||||||
likelihoods[genotype.ordinal()] += Math.log10(p / 2);
|
p += r == pileupBase ? 1 - epsilon : epsilon / 3;
|
||||||
}
|
likelihoods[genotype.ordinal()] += Math.log10(p / genotype.toString().length());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Integer sortedList[] = Utils.SortPermutation(likelihoods);
|
Integer sortedList[] = Utils.SortPermutation(likelihoods);
|
||||||
|
|
||||||
// get our reference genotype
|
|
||||||
String refGenotype = (String.valueOf(ref.getBase()) + String.valueOf(ref.getBase())).toUpperCase();
|
|
||||||
|
|
||||||
// create call using the best genotype (GENOTYPE.values()[sortedList[9]].toString())
|
// create call using the best genotype (GENOTYPE.values()[sortedList[9]].toString())
|
||||||
// and calculate the LOD score from best - ref (likelihoods[sortedList[9]] - likelihoods[sortedList[8])
|
// and calculate the LOD score from best - next best (9 and 8 in the sorted list, since the best likelihoods are closest to zero)
|
||||||
return new SimpleCall(context.getLocation(),
|
return new SimpleCall(context.getLocation(),
|
||||||
GENOTYPE.values()[sortedList[9]].toString(),
|
GENOTYPE.values()[sortedList[9]].toString(),
|
||||||
likelihoods[sortedList[9]] - likelihoods[GENOTYPE.valueOf(refGenotype).ordinal()]);
|
likelihoods[sortedList[9]] - likelihoods[sortedList[8]],
|
||||||
|
ref.getBase());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -80,14 +85,16 @@ public class GATKPaperGenotyper extends LocusWalker<SimpleCall, Integer> impleme
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
* Reduces a single map with the accumulator provided as the ReduceType. We filter out calls,
|
||||||
|
* first making sure that the call is != null, secondly that the LOD score is above a moderate
|
||||||
|
* threshold (set by the log_odds_score argument, 3.0 by default).
|
||||||
*
|
*
|
||||||
* @param value result of the map.
|
* @param value result of the map.
|
||||||
* @param sum accumulator for the reduce.
|
* @param sum accumulator for the reduce.
|
||||||
* @return accumulator with result of the map taken into account.
|
* @return accumulator with result of the map taken into account.
|
||||||
*/
|
*/
|
||||||
public Integer reduce(SimpleCall value, Integer sum) {
|
public Integer reduce(SimpleCall value, Integer sum) {
|
||||||
if (value != null) outputStream.println(value.toString());
|
if (value != null && value.LOD > LODScore) outputStream.println(value.toString());
|
||||||
return sum + 1;
|
return sum + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,14 +15,15 @@ class SimpleCall {
|
||||||
public String genotype;
|
public String genotype;
|
||||||
public double LOD;
|
public double LOD;
|
||||||
public GenomeLoc loc;
|
public GenomeLoc loc;
|
||||||
|
public char ref;
|
||||||
SimpleCall(GenomeLoc location, String gt, double lod) {
|
SimpleCall(GenomeLoc location, String gt, double lod, char reference) {
|
||||||
genotype = gt;
|
genotype = gt;
|
||||||
LOD = lod;
|
LOD = lod;
|
||||||
loc = location;
|
loc = location;
|
||||||
|
this.ref = reference;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return String.format("%s : %s with LOD %.4f", loc, genotype, LOD);
|
return String.format("%s\t%s\t%.4f\t%c", loc, genotype, LOD,ref);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue