From 93d6e17762bdaeab6be7d988252a1827db0994cf Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 8 Jun 2011 20:22:28 +0000 Subject: [PATCH] Final, documented version of CalibrateGenotypeLikelihoods. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5966 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/CalibrateGenotypeLikelihoods.java | 67 ++++++++++++------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java index db2d7fc31..ff0a9696c 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java @@ -28,26 +28,19 @@ package org.broadinstitute.sting.oneoffprojects.walkers; import net.sf.samtools.SAMReadGroupRecord; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.GenotypeLikelihoods; -import org.broad.tribble.util.variantcontext.MutableVariantContext; import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFHeaderLine; -import org.broad.tribble.vcf.VCFWriter; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.io.PrintStream; import java.util.*; @@ -55,11 +48,11 @@ import java.util.*; import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; /** - * Validates the calls on a ROD track using a BAM dataset. + * Computes raw GL calibration data for read groups in BAMs against a comp VCF track of genotypes * - * @author carneiro - * @since Mar 3, 2011 - * @help.summary Validates the calls on a ROD track using a BAM dataset. + * @author depristo + * @since May, 2011 + * @help.summary Computes raw GL calibration data for read groups in BAMs against a comp VCF track of genotypes */ @Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="alleles",type=VariantContext.class)) @@ -85,6 +78,9 @@ public class CalibrateGenotypeLikelihoods extends RodWalker samples; + /** + * Trivial wrapper class. Data is a collection of Datum. + */ public static class Data { Collection values; @@ -94,12 +90,18 @@ public class CalibrateGenotypeLikelihoods extends RodWalkeremptyList()); } + /** + * The raw datapoints we are tracking for a specific site for a specific sample. + * read group id and sample name. The PL object. + * the ref and alt alleles. The type of the variant context. And the genotype of the + * comp. track at this site. + */ public static class Datum implements Comparable { - String rgID, sample; - GenotypeLikelihoods pl; - String ref, alt; - VariantContext.Type siteType; - Genotype.Type genotypeType; + final String rgID, sample; + final GenotypeLikelihoods pl; + final String ref, alt; + final VariantContext.Type siteType; + final Genotype.Type genotypeType; @Override public int compareTo(Datum o) { @@ -129,8 +131,11 @@ public class CalibrateGenotypeLikelihoods extends RodWalker 1 ) // todo -- remove me when we support multiple samples + throw new UserException.BadInput("CalibrateGenotypeLikelihoods does not currently support comparison of multiple samples simulatenously. To enable, see TODO in code"); List rodList = this.getToolkit().getRodDataSources(); if ( rodList.size() != 1 ) @@ -143,12 +148,9 @@ public class CalibrateGenotypeLikelihoods extends RodWalker= 0) uac.MIN_BASE_QUALTY_SCORE = mbq; if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions; - uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; - uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac); @@ -158,6 +160,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker byRG = AlignmentContextUtils.splitContextByReadGroup(context, getToolkit().getSAMFileHeader().getReadGroups()); - //byRG.put(new SAMReadGroupRecord("ALL"), context); + //byRG.put(new SAMReadGroupRecord("ALL"), context); // uncomment to include a synthetic RG for all RG for the sample for ( Map.Entry rgAC : byRG.entrySet() ) { VariantCallContext call; if ( vcComp.isIndel() ) { - call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, rgAC.getValue()); + throw new UserException.BadInput("CalibrateGenotypeLikelihoods does not currently support indel GL calibration. This capability needs to be tested and verified to be working with the new genotyping code for indels in UG"); + //call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, rgAC.getValue()); } else { call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, rgAC.getValue()); } @@ -209,6 +215,15 @@ public class CalibrateGenotypeLikelihoods extends RodWalker fields = Arrays.asList("sample", "rg", "ref", "alt", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD"); out.println(Utils.join("\t", fields)); + // determine the priors by counting all of the events we've seen in comp double[] counts = new double[]{1, 1, 1}; for ( Datum d : data.values ) { counts[d.genotypeType.ordinal()-1]++; } double sum = MathUtils.sum(counts); @@ -257,6 +273,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker