Final, documented version of CalibrateGenotypeLikelihoods.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5966 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-06-08 20:22:28 +00:00
parent 44287ea8dc
commit 93d6e17762
1 changed files with 42 additions and 25 deletions

View File

@ -28,26 +28,19 @@ package org.broadinstitute.sting.oneoffprojects.walkers;
import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMReadGroupRecord;
import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods; import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
import org.broad.tribble.util.variantcontext.MutableVariantContext;
import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.*; import java.util.*;
@ -55,11 +48,11 @@ import java.util.*;
import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel;
/** /**
* Validates the calls on a ROD track using a BAM dataset. * Computes raw GL calibration data for read groups in BAMs against a comp VCF track of genotypes
* *
* @author carneiro * @author depristo
* @since Mar 3, 2011 * @since May, 2011
* @help.summary Validates the calls on a ROD track using a BAM dataset. * @help.summary Computes raw GL calibration data for read groups in BAMs against a comp VCF track of genotypes
*/ */
@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="alleles",type=VariantContext.class)) @Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="alleles",type=VariantContext.class))
@ -85,6 +78,9 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
Set<String> samples; Set<String> samples;
/**
* Trivial wrapper class. Data is a collection of Datum.
*/
public static class Data { public static class Data {
Collection<Datum> values; Collection<Datum> values;
@ -94,12 +90,18 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
final public static Data EMPTY_DATA = new Data(Collections.<Datum>emptyList()); final public static Data EMPTY_DATA = new Data(Collections.<Datum>emptyList());
} }
/**
* The raw datapoints we are tracking for a specific site for a specific sample.
* read group id and sample name. The PL object.
* the ref and alt alleles. The type of the variant context. And the genotype of the
* comp. track at this site.
*/
public static class Datum implements Comparable<Datum> { public static class Datum implements Comparable<Datum> {
String rgID, sample; final String rgID, sample;
GenotypeLikelihoods pl; final GenotypeLikelihoods pl;
String ref, alt; final String ref, alt;
VariantContext.Type siteType; final VariantContext.Type siteType;
Genotype.Type genotypeType; final Genotype.Type genotypeType;
@Override @Override
public int compareTo(Datum o) { public int compareTo(Datum o) {
@ -129,8 +131,11 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
public void initialize() { public void initialize() {
// We only operate over the samples in the BAM file
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
logger.info("Samples: " + samples); logger.info("Samples: " + samples);
if ( samples.size() > 1 ) // todo -- remove me when we support multiple samples
throw new UserException.BadInput("CalibrateGenotypeLikelihoods does not currently support comparison of multiple samples simulatenously. To enable, see TODO in code");
List<ReferenceOrderedDataSource> rodList = this.getToolkit().getRodDataSources(); List<ReferenceOrderedDataSource> rodList = this.getToolkit().getRodDataSources();
if ( rodList.size() != 1 ) if ( rodList.size() != 1 )
@ -143,12 +148,9 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
uac.NO_SLOD = true; uac.NO_SLOD = true;
if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq; if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq;
if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions; if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions;
uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf;
uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac); snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
@ -158,6 +160,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
} }
@Override @Override
// todo -- remove me when the new indel genotyping is done
public boolean generateExtendedEvents() { return true; } public boolean generateExtendedEvents() { return true; }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -169,26 +172,29 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
if ( tracker == null || tracker.getNBoundRodTracks() == 0 ) if ( tracker == null || tracker.getNBoundRodTracks() == 0 )
return Data.EMPTY_DATA; return Data.EMPTY_DATA;
// Grabs a usable VariantContext from the Alleles ROD
VariantContext vcComp = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger); VariantContext vcComp = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger);
if( vcComp == null ) if( vcComp == null )
return Data.EMPTY_DATA; return Data.EMPTY_DATA;
//todo - not sure I want this, may be misleading to filter extended indel events.
if (isInsideExtendedIndel(vcComp, ref))
return Data.EMPTY_DATA;
Data data = new Data(); Data data = new Data();
for ( String sample : samples ) { for ( String sample : samples ) {
// What's the genotype of our sample at this record?
Genotype compGT = getGenotype(tracker, ref, sample, COMP_NAME); Genotype compGT = getGenotype(tracker, ref, sample, COMP_NAME);
if ( compGT == null || compGT.isNoCall() ) if ( compGT == null || compGT.isNoCall() )
continue; continue;
// For each read group
// todo -- this only works with a single sample right now. For multi-sample BAMs
// todo -- this loop needs to be refactored so that the spliting by read group only happens once
// todo -- and the read groups appropriate to each sample is used.
Map<SAMReadGroupRecord,AlignmentContext> byRG = AlignmentContextUtils.splitContextByReadGroup(context, getToolkit().getSAMFileHeader().getReadGroups()); Map<SAMReadGroupRecord,AlignmentContext> byRG = AlignmentContextUtils.splitContextByReadGroup(context, getToolkit().getSAMFileHeader().getReadGroups());
//byRG.put(new SAMReadGroupRecord("ALL"), context); //byRG.put(new SAMReadGroupRecord("ALL"), context); // uncomment to include a synthetic RG for all RG for the sample
for ( Map.Entry<SAMReadGroupRecord, AlignmentContext> rgAC : byRG.entrySet() ) { for ( Map.Entry<SAMReadGroupRecord, AlignmentContext> rgAC : byRG.entrySet() ) {
VariantCallContext call; VariantCallContext call;
if ( vcComp.isIndel() ) { if ( vcComp.isIndel() ) {
call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, rgAC.getValue()); throw new UserException.BadInput("CalibrateGenotypeLikelihoods does not currently support indel GL calibration. This capability needs to be tested and verified to be working with the new genotyping code for indels in UG");
//call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, rgAC.getValue());
} else { } else {
call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, rgAC.getValue()); call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, rgAC.getValue());
} }
@ -209,6 +215,15 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
return data; return data;
} }
/**
* Convenience function that determines the genotype in the comp VC for sample
*
* @param tracker
* @param ref
* @param sample
* @param rod
* @return
*/
private Genotype getGenotype(RefMetaDataTracker tracker, ReferenceContext ref, String sample, String rod) { private Genotype getGenotype(RefMetaDataTracker tracker, ReferenceContext ref, String sample, String rod) {
for ( VariantContext vc : tracker.getVariantContexts(ref, rod, null, ref.getLocus(), true, false) ) { for ( VariantContext vc : tracker.getVariantContexts(ref, rod, null, ref.getLocus(), true, false) ) {
if ( vc.isNotFiltered() && vc.hasGenotype(sample) ) if ( vc.isNotFiltered() && vc.hasGenotype(sample) )
@ -249,6 +264,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
List<String> fields = Arrays.asList("sample", "rg", "ref", "alt", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD"); List<String> fields = Arrays.asList("sample", "rg", "ref", "alt", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD");
out.println(Utils.join("\t", fields)); out.println(Utils.join("\t", fields));
// determine the priors by counting all of the events we've seen in comp
double[] counts = new double[]{1, 1, 1}; double[] counts = new double[]{1, 1, 1};
for ( Datum d : data.values ) { counts[d.genotypeType.ordinal()-1]++; } for ( Datum d : data.values ) { counts[d.genotypeType.ordinal()-1]++; }
double sum = MathUtils.sum(counts); double sum = MathUtils.sum(counts);
@ -257,6 +273,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
double[] log10priors = new double[]{Math.log10(counts[0] / sum), Math.log10(counts[1] / sum), Math.log10(counts[2] / sum)}; double[] log10priors = new double[]{Math.log10(counts[0] / sum), Math.log10(counts[1] / sum), Math.log10(counts[2] / sum)};
logger.info(String.format("Priors %.2f %.2f %.2f", log10priors[0], log10priors[1], log10priors[2])); logger.info(String.format("Priors %.2f %.2f %.2f", log10priors[0], log10priors[1], log10priors[2]));
// emit the molten data set
for ( Datum d : data.values ) { for ( Datum d : data.values ) {
double[] log10pGGivenD = d.pl.getAsVector().clone(); double[] log10pGGivenD = d.pl.getAsVector().clone();
for ( int i = 0; i < log10priors.length; i++ ) log10pGGivenD[i] += log10priors[i]; for ( int i = 0; i < log10priors.length; i++ ) log10pGGivenD[i] += log10priors[i];