From 7997455f38324e7d485adfb3d1043a4f8dce2e87 Mon Sep 17 00:00:00 2001 From: aaron Date: Thu, 19 Nov 2009 07:55:24 +0000 Subject: [PATCH] first go of the genotyper for the GATK paper. More testing and review tomorrow to call it done. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2080 348d0f76-0448-11de-a6fe-93d51630548a --- .../papergenotyper/GATKPaperGenotyper.java | 115 ++++++++++++++++++ .../walkers/papergenotyper/SimpleCall.java | 28 +++++ .../papergenotyper/SimpleCallList.java | 68 +++++++++++ 3 files changed, 211 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/GATKPaperGenotyper.java create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/SimpleCall.java create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/SimpleCallList.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/GATKPaperGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/GATKPaperGenotyper.java new file mode 100644 index 000000000..461b25197 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/GATKPaperGenotyper.java @@ -0,0 +1,115 @@ +package org.broadinstitute.sting.playground.gatk.walkers.papergenotyper; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidGenotypePriors; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.io.File; + + +/** + * @author aaron + *

+ * Class GATKPaperGenotyper + *

+ * A simple Bayesian genotyper, that output a text based call format. Intended to be used only as an + * example in the GATK publication. + */ +public class GATKPaperGenotyper extends LocusWalker implements TreeReducible { + + // the possible diploid genotype strings + private static enum GENOTYPE { AA, AC, AG, AT, CC, CG, CT, GG, GT, TT } + + // the epsilon value we're using to model our error rate + private static double EPSILON = 1e-4; + + @Argument(fullName = "call_location", shortName = "cl", doc = "File to which calls should be written", required = true) + private File LOCATION = new File("genotyping.output"); + + /** + * our map function, which takes the reads spanning this locus, any associated reference ordered data, + * and the reference information. We output a simple genotype call as the result of this function + * + * @param tracker the reference ordered data tracker + * @param ref the reference information + * @param context the locus context, which contains all of the read information + * @return a SimpleCall, which stores the genotype we're calling and the LOD score + */ + public SimpleCall map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (ref.getBase() == 'N' || ref.getBase() == 'n') return null; // we don't deal with the N ref base case + + ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref.getBase(), context.getReads(), context.getOffsets()); + double likelihoods[] = DiploidGenotypePriors.getReferencePolarizedPriors(ref.getBase(), + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, + DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE); + + for (GENOTYPE genotype : GENOTYPE.values()) + for (byte pileupBase : pileup.getBases()) { + for (char genotypeBase : genotype.toString().toCharArray()) + if (genotypeBase == pileupBase) + likelihoods[genotype.ordinal()] += 1 / 2 * (1 - EPSILON) + EPSILON / 3; + else + likelihoods[genotype.ordinal()] += EPSILON / 3; + } + + Integer sortedList[] = Utils.SortPermutation(likelihoods); + + // get our reference genotype + String refGenotype = (String.valueOf(ref.getBase()) + String.valueOf(ref.getBase())).toUpperCase(); + + // create call using the best genotype (GENOTYPE.values()[sortedList[9]].toString()) + // and calculate the LOD score from best - ref (likelihoods[sortedList[9]] - likelihoods[sortedList[8]) + return new SimpleCall(context.getLocation(), + GENOTYPE.values()[sortedList[9]].toString(), + likelihoods[sortedList[9]] - likelihoods[GENOTYPE.valueOf(refGenotype).ordinal()]); + } + + /** + * Provide an initial value for reduce computations. In this case we simply return an empty list + * + * @return Initial value of reduce. + */ + public SimpleCallList reduceInit() { + return new SimpleCallList(LOCATION); + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public SimpleCallList reduce(SimpleCall value, SimpleCallList sum) { + if (value != null) sum.add(value); + return sum; + } + + /** + * A composite, 'reduce of reduces' function. + * + * @param lhs 'left-most' portion of data in the composite reduce. + * @param rhs 'right-most' portion of data in the composite reduce. + * @return The composite reduce type. + */ + public SimpleCallList treeReduce(SimpleCallList lhs, SimpleCallList rhs) { + lhs.addAll(rhs); + return lhs; + } + + /** + * when we finish traversing, close the result list + * @param result the final reduce result + */ + public void onTraversalDone(SimpleCallList result) { + result.close(); + } +} + + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/SimpleCall.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/SimpleCall.java new file mode 100644 index 000000000..4eeaa2164 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/SimpleCall.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.playground.gatk.walkers.papergenotyper; + +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * Created by IntelliJ IDEA. + * User: aaron + * Date: Nov 19, 2009 + * Time: 2:07:25 AM + * + * This simple call class stores the data for the per-locus calls of the GATKPaperGenotyper. + * + */ +class SimpleCall { + public String genotype; + public double LOD; + public GenomeLoc loc; + + SimpleCall(GenomeLoc location, String gt, double lod) { + genotype = gt; + LOD = lod; + loc = location; + } + + public String toString() { + return String.format("Location %s : %s with LOD %.2f", loc, genotype, LOD); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/SimpleCallList.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/SimpleCallList.java new file mode 100644 index 000000000..bc3b9fea6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/SimpleCallList.java @@ -0,0 +1,68 @@ +package org.broadinstitute.sting.playground.gatk.walkers.papergenotyper; + +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.util.AbstractList; +import java.util.ArrayList; +import java.util.Collection; + +/** + * Created by IntelliJ IDEA. + * User: aaronmckenna + * Date: Nov 19, 2009 + * Time: 12:50:20 AM + * + * A simple class, that dumps the records to disk when we've hit a threshold. + * This class makes the GATKPaperGenotyper much simpler to take in for the reader. + */ +class SimpleCallList extends AbstractList { + private File outputLocation; + private ArrayList list = new ArrayList(); + private int WRITE_LIMIT = 100000; + public SimpleCallList(File writeTo) { + outputLocation = writeTo; + } + + public boolean add(SimpleCall call) { + boolean added = list.add(call); + writeIfNeeded(); + return added; + } + + public boolean addAll(Collection otherCalls) { + boolean added = list.addAll(otherCalls); + writeIfNeeded(); + return added; + } + + public void writeIfNeeded() { + synchronized(list) { + if (list.size() > WRITE_LIMIT) { + try { + PrintWriter writer = new PrintWriter(new FileWriter(outputLocation, true)); + for (SimpleCall call : list) writer.println(call.toString()); + writer.close(); + } catch (IOException e) { + throw new StingException("Unable to write to file " + outputLocation); + } + list.clear(); + } + } + } + @Override + public int size() { + return list.size(); + } + + @Override + public SimpleCall get(int index) { + return list.get(index); + } + + public void close() { + WRITE_LIMIT = 0; + writeIfNeeded(); + } +} +