Adding UGCalcLikelihoods and UGCallVariants so that GSA members can break up the calling process into separate steps (calculate the GLs and then call off of those) - useful for Chris's new batch merger. As the docs say, these are absolutely not supported or recommended for public use.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4764 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-12-01 17:32:26 +00:00
parent b4ef716aaf
commit 0d1c905df3
4 changed files with 294 additions and 48 deletions

View File

@ -88,23 +88,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
logYMatrix[0][0] = 0.0;
int j=0;
// call clear ref sites separately to speed computation
/*
boolean isClearRefSite = true;
int bestAFguess = 0;
for ( String sample : GLs.keySet() ) {
double[] genotypeLikelihoods = GLs.get(sample).getLikelihoods();
if (!(genotypeLikelihoods[0] > genotypeLikelihoods[1] &&
genotypeLikelihoods[0] > genotypeLikelihoods[2]))
isClearRefSite = false;
bestAFguess += MathUtils.maxElementIndex(genotypeLikelihoods);
}
*/
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
j++;
@ -121,9 +104,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()];
for (int k=1; k <= 2*j; k++ ) {
// if (k > 3 && isClearRefSite)
// break;
//double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
double logNumerator[];
@ -164,10 +144,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
double softMax(double[] vec) {
// compute naively log10(10^x[0] + 10^x[1]+...)
// return Math.log10(MathUtils.sumLog10(vec));
// return Math.log10(MathUtils.sumLog10(vec));
// better approximation: do Jacobian logarithm function on data pairs
double a = softMaxPair(vec[0],vec[1]);
// better approximation: do Jacobian logarithm function on data pairs
double a = softMaxPair(vec[0],vec[1]);
return softMaxPair(a,vec[2]);
}
@ -178,32 +158,32 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if (Double.isInfinite(y))
return x;
if (y >= x + MAX_JACOBIAN_TOLERANCE)
return y;
if (x >= y + MAX_JACOBIAN_TOLERANCE)
return x;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
double diff = Math.abs(x-y);
double t1 =x;
if (y > x)
t1 = y;
// t has max(x,y)
// we have pre-stored correction for 0,0.1,0.2,... 10.0
int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
double t2 = jacobianLogTable[ind];
if (y >= x + MAX_JACOBIAN_TOLERANCE)
return y;
if (x >= y + MAX_JACOBIAN_TOLERANCE)
return x;
// gdebug+
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
//gdebug-
return t1+t2;
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
double diff = Math.abs(x-y);
double t1 =x;
if (y > x)
t1 = y;
// t has max(x,y)
// we have pre-stored correction for 0,0.1,0.2,... 10.0
int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
double t2 = jacobianLogTable[ind];
// gdebug+
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
//gdebug-
return t1+t2;
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
}

View File

@ -0,0 +1,112 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.SampleUtils;
import java.util.*;
/**
* Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs).
* Absolutely not supported or recommended for public use.
*/
@Reference(window=@Window(start=-200,stop=200))
@By(DataSource.READS)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UGCalcLikelihoods extends LocusWalker<VariantCallContext, Integer> implements TreeReducible<Integer> {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
// enable deletions in the pileup
public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable extended events for indels
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
public void initialize() {
// get all of the unique sample names
// if we're supposed to assume a single sample, do so
Set<String> samples = new TreeSet<String>();
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
// initialize the header
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
writer.writeHeader(new VCFHeader(headerInfo, samples)) ;
}
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
return new VariantCallContext(UG_engine.calculateLikelihoods(tracker, refContext, rawContext), refContext.getBase(), true);
}
public Integer reduceInit() { return 0; }
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
}
public Integer reduce(VariantCallContext value, Integer sum) {
if ( value == null )
return sum;
try {
writer.add(value.vc, value.refBase);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
}
return sum + 1;
}
public void onTraversalDone(Integer sum) {
logger.info(String.format("Visited bases: %d", sum));
}
}

View File

@ -0,0 +1,152 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*;
/**
* Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs).
* Absolutely not supported or recommended for public use.
*/
public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
@ArgumentCollection
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
// variant track names
private Set<String> trackNames = new HashSet<String>();
public void initialize() {
UAC.NO_SLOD = true;
for ( ReferenceOrderedDataSource d : getToolkit().getRodDataSources() ) {
if ( d.getName().startsWith("variant") )
trackNames.add(d.getName());
}
if ( trackNames.size() == 0 )
throw new UserException("At least one track bound to a name beginning with 'variant' must be provided.");
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames);
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING )
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
// initialize the header
writer.writeHeader(new VCFHeader(headerInfo, samples));
}
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return null;
List<VariantContext> VCs = new ArrayList<VariantContext>();
for ( String name : trackNames ) {
Collection<VariantContext> vc = tracker.getVariantContexts(ref, name, null, context.getLocation(), true, true);
VCs.addAll(vc);
}
VariantContext mergedVC = mergeVCsWithGLs(VCs);
if ( mergedVC == null )
return null;
return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC);
}
public Integer reduceInit() { return 0; }
public Integer reduce(VariantCallContext value, Integer sum) {
if ( value == null || value.vc == null )
return sum;
try {
Map<String, Object> attrs = new HashMap<String, Object>(value.vc.getAttributes());
VariantContextUtils.calculateChromosomeCounts(value.vc, attrs, true);
writer.add(VariantContext.modifyAttributes(value.vc, attrs), value.refBase);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
}
return sum + 1;
}
public void onTraversalDone(Integer result) {
logger.info(String.format("Visited sites: %d", result));
}
private static VariantContext mergeVCsWithGLs(List<VariantContext> VCs) {
// we can't use the VCUtils classes because our VCs can all be no-calls
if ( VCs.size() == 0 )
return null;
VariantContext vc = VCs.get(0);
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(getGenotypesWithGLs(vc.getGenotypes()));
for (int i = 1; i < VCs.size(); i++)
genotypes.putAll(getGenotypesWithGLs(VCs.get(i).getGenotypes()));
return new VariantContext("VCwithGLs", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, null);
}
private static Map<String, Genotype> getGenotypesWithGLs(Map<String, Genotype> genotypes) {
Map<String, Genotype> genotypesWithGLs = new HashMap<String, Genotype>();
for ( Map.Entry<String, Genotype> g : genotypes.entrySet() ) {
if ( g.getValue().hasLikelihoods() && g.getValue().getLikelihoods().getAsVector() != null )
genotypesWithGLs.put(g.getKey(), g.getValue());
}
return genotypesWithGLs;
}
}

View File

@ -448,6 +448,8 @@ public class UnifiedGenotyperEngine {
}
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
if ( contexts == null )
return null;
double P_of_ref = initialPofRef;