first go of the genotyper for the GATK paper. More testing and review tomorrow to call it done.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2080 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7b957d3e2e
commit
7997455f38
|
|
@ -0,0 +1,115 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.papergenotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidGenotypePriors;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class GATKPaperGenotyper
|
||||
* <p/>
|
||||
* A simple Bayesian genotyper, that output a text based call format. Intended to be used only as an
|
||||
* example in the GATK publication.
|
||||
*/
|
||||
public class GATKPaperGenotyper extends LocusWalker<SimpleCall, SimpleCallList> implements TreeReducible<SimpleCallList> {
|
||||
|
||||
// the possible diploid genotype strings
|
||||
private static enum GENOTYPE { AA, AC, AG, AT, CC, CG, CT, GG, GT, TT }
|
||||
|
||||
// the epsilon value we're using to model our error rate
|
||||
private static double EPSILON = 1e-4;
|
||||
|
||||
@Argument(fullName = "call_location", shortName = "cl", doc = "File to which calls should be written", required = true)
|
||||
private File LOCATION = new File("genotyping.output");
|
||||
|
||||
/**
|
||||
* our map function, which takes the reads spanning this locus, any associated reference ordered data,
|
||||
* and the reference information. We output a simple genotype call as the result of this function
|
||||
*
|
||||
* @param tracker the reference ordered data tracker
|
||||
* @param ref the reference information
|
||||
* @param context the locus context, which contains all of the read information
|
||||
* @return a SimpleCall, which stores the genotype we're calling and the LOD score
|
||||
*/
|
||||
public SimpleCall map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (ref.getBase() == 'N' || ref.getBase() == 'n') return null; // we don't deal with the N ref base case
|
||||
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref.getBase(), context.getReads(), context.getOffsets());
|
||||
double likelihoods[] = DiploidGenotypePriors.getReferencePolarizedPriors(ref.getBase(),
|
||||
DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,
|
||||
DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);
|
||||
|
||||
for (GENOTYPE genotype : GENOTYPE.values())
|
||||
for (byte pileupBase : pileup.getBases()) {
|
||||
for (char genotypeBase : genotype.toString().toCharArray())
|
||||
if (genotypeBase == pileupBase)
|
||||
likelihoods[genotype.ordinal()] += 1 / 2 * (1 - EPSILON) + EPSILON / 3;
|
||||
else
|
||||
likelihoods[genotype.ordinal()] += EPSILON / 3;
|
||||
}
|
||||
|
||||
Integer sortedList[] = Utils.SortPermutation(likelihoods);
|
||||
|
||||
// get our reference genotype
|
||||
String refGenotype = (String.valueOf(ref.getBase()) + String.valueOf(ref.getBase())).toUpperCase();
|
||||
|
||||
// create call using the best genotype (GENOTYPE.values()[sortedList[9]].toString())
|
||||
// and calculate the LOD score from best - ref (likelihoods[sortedList[9]] - likelihoods[sortedList[8])
|
||||
return new SimpleCall(context.getLocation(),
|
||||
GENOTYPE.values()[sortedList[9]].toString(),
|
||||
likelihoods[sortedList[9]] - likelihoods[GENOTYPE.valueOf(refGenotype).ordinal()]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations. In this case we simply return an empty list
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
public SimpleCallList reduceInit() {
|
||||
return new SimpleCallList(LOCATION);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return accumulator with result of the map taken into account.
|
||||
*/
|
||||
public SimpleCallList reduce(SimpleCall value, SimpleCallList sum) {
|
||||
if (value != null) sum.add(value);
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* A composite, 'reduce of reduces' function.
|
||||
*
|
||||
* @param lhs 'left-most' portion of data in the composite reduce.
|
||||
* @param rhs 'right-most' portion of data in the composite reduce.
|
||||
* @return The composite reduce type.
|
||||
*/
|
||||
public SimpleCallList treeReduce(SimpleCallList lhs, SimpleCallList rhs) {
|
||||
lhs.addAll(rhs);
|
||||
return lhs;
|
||||
}
|
||||
|
||||
/**
|
||||
* when we finish traversing, close the result list
|
||||
* @param result the final reduce result
|
||||
*/
|
||||
public void onTraversalDone(SimpleCallList result) {
|
||||
result.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.papergenotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaron
|
||||
* Date: Nov 19, 2009
|
||||
* Time: 2:07:25 AM
|
||||
*
|
||||
* This simple call class stores the data for the per-locus calls of the GATKPaperGenotyper.
|
||||
*
|
||||
*/
|
||||
class SimpleCall {
|
||||
public String genotype;
|
||||
public double LOD;
|
||||
public GenomeLoc loc;
|
||||
|
||||
SimpleCall(GenomeLoc location, String gt, double lod) {
|
||||
genotype = gt;
|
||||
LOD = lod;
|
||||
loc = location;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("Location %s : %s with LOD %.2f", loc, genotype, LOD);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.papergenotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.AbstractList;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaronmckenna
|
||||
* Date: Nov 19, 2009
|
||||
* Time: 12:50:20 AM
|
||||
*
|
||||
* A simple class, that dumps the records to disk when we've hit a threshold.
|
||||
* This class makes the GATKPaperGenotyper much simpler to take in for the reader.
|
||||
*/
|
||||
class SimpleCallList extends AbstractList<SimpleCall> {
|
||||
private File outputLocation;
|
||||
private ArrayList<SimpleCall> list = new ArrayList<SimpleCall>();
|
||||
private int WRITE_LIMIT = 100000;
|
||||
public SimpleCallList(File writeTo) {
|
||||
outputLocation = writeTo;
|
||||
}
|
||||
|
||||
public boolean add(SimpleCall call) {
|
||||
boolean added = list.add(call);
|
||||
writeIfNeeded();
|
||||
return added;
|
||||
}
|
||||
|
||||
public boolean addAll(Collection<? extends SimpleCall> otherCalls) {
|
||||
boolean added = list.addAll(otherCalls);
|
||||
writeIfNeeded();
|
||||
return added;
|
||||
}
|
||||
|
||||
public void writeIfNeeded() {
|
||||
synchronized(list) {
|
||||
if (list.size() > WRITE_LIMIT) {
|
||||
try {
|
||||
PrintWriter writer = new PrintWriter(new FileWriter(outputLocation, true));
|
||||
for (SimpleCall call : list) writer.println(call.toString());
|
||||
writer.close();
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Unable to write to file " + outputLocation);
|
||||
}
|
||||
list.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public int size() {
|
||||
return list.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimpleCall get(int index) {
|
||||
return list.get(index);
|
||||
}
|
||||
|
||||
public void close() {
|
||||
WRITE_LIMIT = 0;
|
||||
writeIfNeeded();
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue