diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 76142dd40..3b83f09dc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -88,23 +88,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { logYMatrix[0][0] = 0.0; int j=0; - - // call clear ref sites separately to speed computation -/* - boolean isClearRefSite = true; - - int bestAFguess = 0; - for ( String sample : GLs.keySet() ) { - - double[] genotypeLikelihoods = GLs.get(sample).getLikelihoods(); - if (!(genotypeLikelihoods[0] > genotypeLikelihoods[1] && - genotypeLikelihoods[0] > genotypeLikelihoods[2])) - isClearRefSite = false; - - bestAFguess += MathUtils.maxElementIndex(genotypeLikelihoods); - - } - */ for ( Map.Entry sample : GLs.entrySet() ) { j++; @@ -121,9 +104,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()]; for (int k=1; k <= 2*j; k++ ) { - // if (k > 3 && isClearRefSite) - // break; - //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; double logNumerator[]; @@ -164,10 +144,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { double softMax(double[] vec) { // compute naively log10(10^x[0] + 10^x[1]+...) - // return Math.log10(MathUtils.sumLog10(vec)); + // return Math.log10(MathUtils.sumLog10(vec)); - // better approximation: do Jacobian logarithm function on data pairs - double a = softMaxPair(vec[0],vec[1]); + // better approximation: do Jacobian logarithm function on data pairs + double a = softMaxPair(vec[0],vec[1]); return softMaxPair(a,vec[2]); } @@ -178,32 +158,32 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if (Double.isInfinite(y)) return x; - if (y >= x + MAX_JACOBIAN_TOLERANCE) - return y; - if (x >= y + MAX_JACOBIAN_TOLERANCE) - return x; - - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - double diff = Math.abs(x-y); - double t1 =x; - if (y > x) - t1 = y; - // t has max(x,y) - // we have pre-stored correction for 0,0.1,0.2,... 10.0 - int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP); - double t2 = jacobianLogTable[ind]; + if (y >= x + MAX_JACOBIAN_TOLERANCE) + return y; + if (x >= y + MAX_JACOBIAN_TOLERANCE) + return x; - // gdebug+ - //double z =Math.log10(1+Math.pow(10.0,-diff)); - //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - //gdebug- - return t1+t2; - // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup + // with integer quantization + double diff = Math.abs(x-y); + double t1 =x; + if (y > x) + t1 = y; + // t has max(x,y) + // we have pre-stored correction for 0,0.1,0.2,... 10.0 + int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP); + double t2 = jacobianLogTable[ind]; + + // gdebug+ + //double z =Math.log10(1+Math.pow(10.0,-diff)); + //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); + //gdebug- + return t1+t2; + // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java new file mode 100755 index 000000000..78d313171 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.SampleUtils; + +import java.util.*; + + +/** + * Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs). + * Absolutely not supported or recommended for public use. + */ +@Reference(window=@Window(start=-200,stop=200)) +@By(DataSource.READS) +@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) +public class UGCalcLikelihoods extends LocusWalker implements TreeReducible { + + @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + + // control the output + @Output(doc="File to which variants should be written",required=true) + protected VCFWriter writer = null; + + // the calculation arguments + private UnifiedGenotyperEngine UG_engine = null; + + // enable deletions in the pileup + public boolean includeReadsWithDeletionAtLoci() { return true; } + + // enable extended events for indels + public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; } + + public void initialize() { + // get all of the unique sample names + // if we're supposed to assume a single sample, do so + Set samples = new TreeSet(); + if ( UAC.ASSUME_SINGLE_SAMPLE != null ) + samples.add(UAC.ASSUME_SINGLE_SAMPLE); + else + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); + + // initialize the header + Set headerInfo = new HashSet(); + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); + + writer.writeHeader(new VCFHeader(headerInfo, samples)) ; + } + + public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + return new VariantCallContext(UG_engine.calculateLikelihoods(tracker, refContext, rawContext), refContext.getBase(), true); + } + + public Integer reduceInit() { return 0; } + + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } + + public Integer reduce(VariantCallContext value, Integer sum) { + if ( value == null ) + return sum; + + try { + writer.add(value.vc, value.refBase); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); + } + + return sum + 1; + } + + public void onTraversalDone(Integer sum) { + logger.info(String.format("Visited bases: %d", sum)); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java new file mode 100755 index 000000000..69cf09159 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.*; + +/** + * Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs). + * Absolutely not supported or recommended for public use. + */ +public class UGCallVariants extends RodWalker { + + @ArgumentCollection + private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + + // control the output + @Output(doc="File to which variants should be written",required=true) + protected VCFWriter writer = null; + + // the calculation arguments + private UnifiedGenotyperEngine UG_engine = null; + + // variant track names + private Set trackNames = new HashSet(); + + public void initialize() { + UAC.NO_SLOD = true; + + for ( ReferenceOrderedDataSource d : getToolkit().getRodDataSources() ) { + if ( d.getName().startsWith("variant") ) + trackNames.add(d.getName()); + } + if ( trackNames.size() == 0 ) + throw new UserException("At least one track bound to a name beginning with 'variant' must be provided."); + + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames); + + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); + + Set headerInfo = new HashSet(); + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); + if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING || + UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING ) + headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); + + // initialize the header + writer.writeHeader(new VCFHeader(headerInfo, samples)); + } + + public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return null; + + + List VCs = new ArrayList(); + for ( String name : trackNames ) { + Collection vc = tracker.getVariantContexts(ref, name, null, context.getLocation(), true, true); + VCs.addAll(vc); + } + + VariantContext mergedVC = mergeVCsWithGLs(VCs); + if ( mergedVC == null ) + return null; + + return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC); + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(VariantCallContext value, Integer sum) { + if ( value == null || value.vc == null ) + return sum; + + try { + Map attrs = new HashMap(value.vc.getAttributes()); + VariantContextUtils.calculateChromosomeCounts(value.vc, attrs, true); + writer.add(VariantContext.modifyAttributes(value.vc, attrs), value.refBase); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); + } + + return sum + 1; + } + + public void onTraversalDone(Integer result) { + logger.info(String.format("Visited sites: %d", result)); + } + + private static VariantContext mergeVCsWithGLs(List VCs) { + // we can't use the VCUtils classes because our VCs can all be no-calls + if ( VCs.size() == 0 ) + return null; + + VariantContext vc = VCs.get(0); + Map genotypes = new HashMap(getGenotypesWithGLs(vc.getGenotypes())); + for (int i = 1; i < VCs.size(); i++) + genotypes.putAll(getGenotypesWithGLs(VCs.get(i).getGenotypes())); + return new VariantContext("VCwithGLs", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, null); + } + + private static Map getGenotypesWithGLs(Map genotypes) { + Map genotypesWithGLs = new HashMap(); + for ( Map.Entry g : genotypes.entrySet() ) { + if ( g.getValue().hasLikelihoods() && g.getValue().getLikelihoods().getAsVector() != null ) + genotypesWithGLs.put(g.getKey(), g.getValue()); + } + + return genotypesWithGLs; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 921595bd2..84a4ffed7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -448,6 +448,8 @@ public class UnifiedGenotyperEngine { } private VariantCallContext estimateReferenceConfidence(Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { + if ( contexts == null ) + return null; double P_of_ref = initialPofRef;