Adding UGCalcLikelihoods and UGCallVariants so that GSA members can break up the calling process into separate steps (calculate the GLs and then call off of those) - useful for Chris's new batch merger. As the docs say, these are absolutely not supported or recommended for public use.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4764 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b4ef716aaf
commit
0d1c905df3
|
|
@ -88,23 +88,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
logYMatrix[0][0] = 0.0;
|
||||
int j=0;
|
||||
|
||||
|
||||
// call clear ref sites separately to speed computation
|
||||
/*
|
||||
boolean isClearRefSite = true;
|
||||
|
||||
int bestAFguess = 0;
|
||||
for ( String sample : GLs.keySet() ) {
|
||||
|
||||
double[] genotypeLikelihoods = GLs.get(sample).getLikelihoods();
|
||||
if (!(genotypeLikelihoods[0] > genotypeLikelihoods[1] &&
|
||||
genotypeLikelihoods[0] > genotypeLikelihoods[2]))
|
||||
isClearRefSite = false;
|
||||
|
||||
bestAFguess += MathUtils.maxElementIndex(genotypeLikelihoods);
|
||||
|
||||
}
|
||||
*/
|
||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
||||
j++;
|
||||
|
||||
|
|
@ -121,9 +104,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
|
||||
for (int k=1; k <= 2*j; k++ ) {
|
||||
// if (k > 3 && isClearRefSite)
|
||||
// break;
|
||||
|
||||
|
||||
//double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
double logNumerator[];
|
||||
|
|
@ -164,10 +144,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
double softMax(double[] vec) {
|
||||
// compute naively log10(10^x[0] + 10^x[1]+...)
|
||||
// return Math.log10(MathUtils.sumLog10(vec));
|
||||
// return Math.log10(MathUtils.sumLog10(vec));
|
||||
|
||||
// better approximation: do Jacobian logarithm function on data pairs
|
||||
double a = softMaxPair(vec[0],vec[1]);
|
||||
// better approximation: do Jacobian logarithm function on data pairs
|
||||
double a = softMaxPair(vec[0],vec[1]);
|
||||
return softMaxPair(a,vec[2]);
|
||||
}
|
||||
|
||||
|
|
@ -178,32 +158,32 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
if (Double.isInfinite(y))
|
||||
return x;
|
||||
|
||||
if (y >= x + MAX_JACOBIAN_TOLERANCE)
|
||||
return y;
|
||||
if (x >= y + MAX_JACOBIAN_TOLERANCE)
|
||||
return x;
|
||||
|
||||
// OK, so |y-x| < tol: we use the following identity then:
|
||||
// we need to compute log10(10^x + 10^y)
|
||||
// By Jacobian logarithm identity, this is equal to
|
||||
// max(x,y) + log10(1+10^-abs(x-y))
|
||||
// we compute the second term as a table lookup
|
||||
// with integer quantization
|
||||
double diff = Math.abs(x-y);
|
||||
double t1 =x;
|
||||
if (y > x)
|
||||
t1 = y;
|
||||
// t has max(x,y)
|
||||
// we have pre-stored correction for 0,0.1,0.2,... 10.0
|
||||
int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
|
||||
double t2 = jacobianLogTable[ind];
|
||||
if (y >= x + MAX_JACOBIAN_TOLERANCE)
|
||||
return y;
|
||||
if (x >= y + MAX_JACOBIAN_TOLERANCE)
|
||||
return x;
|
||||
|
||||
// gdebug+
|
||||
//double z =Math.log10(1+Math.pow(10.0,-diff));
|
||||
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
|
||||
//gdebug-
|
||||
return t1+t2;
|
||||
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
|
||||
// OK, so |y-x| < tol: we use the following identity then:
|
||||
// we need to compute log10(10^x + 10^y)
|
||||
// By Jacobian logarithm identity, this is equal to
|
||||
// max(x,y) + log10(1+10^-abs(x-y))
|
||||
// we compute the second term as a table lookup
|
||||
// with integer quantization
|
||||
double diff = Math.abs(x-y);
|
||||
double t1 =x;
|
||||
if (y > x)
|
||||
t1 = y;
|
||||
// t has max(x,y)
|
||||
// we have pre-stored correction for 0,0.1,0.2,... 10.0
|
||||
int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
|
||||
double t2 = jacobianLogTable[ind];
|
||||
|
||||
// gdebug+
|
||||
//double z =Math.log10(1+Math.pow(10.0,-diff));
|
||||
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
|
||||
//gdebug-
|
||||
return t1+t2;
|
||||
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,112 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs).
|
||||
* Absolutely not supported or recommended for public use.
|
||||
*/
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
@By(DataSource.READS)
|
||||
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
|
||||
public class UGCalcLikelihoods extends LocusWalker<VariantCallContext, Integer> implements TreeReducible<Integer> {
|
||||
|
||||
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
// control the output
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
// the calculation arguments
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
||||
// enable deletions in the pileup
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable extended events for indels
|
||||
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
|
||||
|
||||
public void initialize() {
|
||||
// get all of the unique sample names
|
||||
// if we're supposed to assume a single sample, do so
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
|
||||
|
||||
// initialize the header
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
|
||||
writer.writeHeader(new VCFHeader(headerInfo, samples)) ;
|
||||
}
|
||||
|
||||
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
return new VariantCallContext(UG_engine.calculateLikelihoods(tracker, refContext, rawContext), refContext.getBase(), true);
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer treeReduce(Integer lhs, Integer rhs) {
|
||||
return lhs + rhs;
|
||||
}
|
||||
|
||||
public Integer reduce(VariantCallContext value, Integer sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
try {
|
||||
writer.add(value.vc, value.refBase);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||
}
|
||||
|
||||
return sum + 1;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer sum) {
|
||||
logger.info(String.format("Visited bases: %d", sum));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,152 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs).
|
||||
* Absolutely not supported or recommended for public use.
|
||||
*/
|
||||
public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
|
||||
|
||||
@ArgumentCollection
|
||||
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
// control the output
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
// the calculation arguments
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
||||
// variant track names
|
||||
private Set<String> trackNames = new HashSet<String>();
|
||||
|
||||
public void initialize() {
|
||||
UAC.NO_SLOD = true;
|
||||
|
||||
for ( ReferenceOrderedDataSource d : getToolkit().getRodDataSources() ) {
|
||||
if ( d.getName().startsWith("variant") )
|
||||
trackNames.add(d.getName());
|
||||
}
|
||||
if ( trackNames.size() == 0 )
|
||||
throw new UserException("At least one track bound to a name beginning with 'variant' must be provided.");
|
||||
|
||||
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames);
|
||||
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
|
||||
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
|
||||
UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING )
|
||||
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
|
||||
|
||||
// initialize the header
|
||||
writer.writeHeader(new VCFHeader(headerInfo, samples));
|
||||
}
|
||||
|
||||
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
|
||||
List<VariantContext> VCs = new ArrayList<VariantContext>();
|
||||
for ( String name : trackNames ) {
|
||||
Collection<VariantContext> vc = tracker.getVariantContexts(ref, name, null, context.getLocation(), true, true);
|
||||
VCs.addAll(vc);
|
||||
}
|
||||
|
||||
VariantContext mergedVC = mergeVCsWithGLs(VCs);
|
||||
if ( mergedVC == null )
|
||||
return null;
|
||||
|
||||
return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC);
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(VariantCallContext value, Integer sum) {
|
||||
if ( value == null || value.vc == null )
|
||||
return sum;
|
||||
|
||||
try {
|
||||
Map<String, Object> attrs = new HashMap<String, Object>(value.vc.getAttributes());
|
||||
VariantContextUtils.calculateChromosomeCounts(value.vc, attrs, true);
|
||||
writer.add(VariantContext.modifyAttributes(value.vc, attrs), value.refBase);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||
}
|
||||
|
||||
return sum + 1;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
logger.info(String.format("Visited sites: %d", result));
|
||||
}
|
||||
|
||||
private static VariantContext mergeVCsWithGLs(List<VariantContext> VCs) {
|
||||
// we can't use the VCUtils classes because our VCs can all be no-calls
|
||||
if ( VCs.size() == 0 )
|
||||
return null;
|
||||
|
||||
VariantContext vc = VCs.get(0);
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(getGenotypesWithGLs(vc.getGenotypes()));
|
||||
for (int i = 1; i < VCs.size(); i++)
|
||||
genotypes.putAll(getGenotypesWithGLs(VCs.get(i).getGenotypes()));
|
||||
return new VariantContext("VCwithGLs", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, null);
|
||||
}
|
||||
|
||||
private static Map<String, Genotype> getGenotypesWithGLs(Map<String, Genotype> genotypes) {
|
||||
Map<String, Genotype> genotypesWithGLs = new HashMap<String, Genotype>();
|
||||
for ( Map.Entry<String, Genotype> g : genotypes.entrySet() ) {
|
||||
if ( g.getValue().hasLikelihoods() && g.getValue().getLikelihoods().getAsVector() != null )
|
||||
genotypesWithGLs.put(g.getKey(), g.getValue());
|
||||
}
|
||||
|
||||
return genotypesWithGLs;
|
||||
}
|
||||
}
|
||||
|
|
@ -448,6 +448,8 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
|
||||
if ( contexts == null )
|
||||
return null;
|
||||
|
||||
double P_of_ref = initialPofRef;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue