From 40fbeafa3789abee8eeb84eba467e2be600bc110 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 11 Nov 2011 11:52:30 -0500 Subject: [PATCH 01/12] VQSR will now detect if the negative model failed to converge properly because of having too few data points and automatically retry with more appropriate clustering parameters. --- .../GaussianMixtureModel.java | 1 + .../VariantRecalibrator.java | 17 ++++++++++++++++- .../VariantRecalibratorEngine.java | 10 ++++++++-- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index 3fa9c3883..82776ca2e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -52,6 +52,7 @@ public class GaussianMixtureModel { private final double[] empiricalMu; private final Matrix empiricalSigma; public boolean isModelReadyForEvaluation; + public boolean failedToConverge = false; public GaussianMixtureModel( final int numGaussians, final int numAnnotations, final double shrinkage, final double dirichletParameter, final double priorCounts ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index f60a94a22..20b000978 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -309,8 +309,23 @@ public class VariantRecalibrator extends RodWalker negativeTrainingData = dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ); + GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData ); engine.evaluateData( dataManager.getData(), badModel, true ); + + // Detect if the negative model failed to converge because of too few points and/or too many Gaussians and try again + while( badModel.failedToConverge && VRAC.MAX_GAUSSIANS > 4 ) { + logger.info("Negative model failed to converge. Retrying..."); + VRAC.MAX_GAUSSIANS--; + badModel = engine.generateModel( negativeTrainingData ); + engine.evaluateData( dataManager.getData(), goodModel, false ); + engine.evaluateData( dataManager.getData(), badModel, true ); + } + + if( badModel.failedToConverge || goodModel.failedToConverge ) { + throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)"); + } + engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel ); // Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java index adfb38a25..6d2ac643b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -67,14 +67,20 @@ public class VariantRecalibratorEngine { public void evaluateData( final List data, final GaussianMixtureModel model, final boolean evaluateContrastively ) { if( !model.isModelReadyForEvaluation ) { - model.precomputeDenominatorForEvaluation(); + try { + model.precomputeDenominatorForEvaluation(); + } catch( Exception e ) { + model.failedToConverge = true; + return; + } } logger.info("Evaluating full set of " + data.size() + " variants..."); for( final VariantDatum datum : data ) { final double thisLod = evaluateDatum( datum, model ); if( Double.isNaN(thisLod) ) { - throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)"); + model.failedToConverge = true; + return; } datum.lod = ( evaluateContrastively ? From cd3146f4cff4bc81aee7fd728dfb9c9a080e4247 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 11 Nov 2011 14:07:07 -0500 Subject: [PATCH 02/12] Add hidden option to ValidationAmplicons to output slightly modified format to make file work with downstream SQNM tools more seamlessly at request of GAP: one line per record, keep probe identifier to 20 characters, no * in ref allele. --- .../validation/ValidationAmplicons.java | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 035d8d2ca..088c4ddc4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -7,10 +7,7 @@ import org.broadinstitute.sting.alignment.Alignment; import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; import org.broadinstitute.sting.alignment.bwa.BWTFiles; import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -126,6 +123,11 @@ public class ValidationAmplicons extends RodWalker { @Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false) boolean doNotUseBWA = false; + @Hidden + @Argument(doc="Use Sequenom output format instead of regular FASTA",fullName="sqnm",required=false) + boolean sequenomOutput = false; + + GenomeLoc prevInterval; GenomeLoc allelePos; String probeName; @@ -485,6 +487,13 @@ public class ValidationAmplicons extends RodWalker { } String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); - out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); + + if (!sequenomOutput) + out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); + else { + seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record + probeName = probeName.replace("amplicon_","a"); + out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity); + } } } From 76d357be408d4f58348aafc32b6df63c59d79825 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 12 Nov 2011 23:20:05 -0500 Subject: [PATCH 05/12] Updating docs example to use -L since that's best practice --- .../sting/gatk/walkers/annotator/VariantAnnotator.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 087218cfb..58762c054 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -70,6 +70,7 @@ import java.util.*; * -o output.vcf \ * -A DepthOfCoverage * --variant input.vcf \ + * -L input.vcf * --dbsnp dbsnp.vcf * * From b7c33116af89ae3b5cf070fb1f461ac3d11a8746 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 12 Nov 2011 23:21:07 -0500 Subject: [PATCH 06/12] Minor docs update --- .../sting/gatk/walkers/annotator/VariantAnnotator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 58762c054..ea11391d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -68,9 +68,9 @@ import java.util.*; * -T VariantAnnotator \ * -I input.bam \ * -o output.vcf \ - * -A DepthOfCoverage + * -A DepthOfCoverage \ * --variant input.vcf \ - * -L input.vcf + * -L input.vcf \ * --dbsnp dbsnp.vcf * * From 3d2970453b79fff8cd89dd5e5984d3cacb9b58ea Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 14 Nov 2011 09:41:54 -0500 Subject: [PATCH 07/12] Misc minor cleanup --- .../sting/gatk/walkers/recalibration/CycleCovariate.java | 2 -- .../org/broadinstitute/sting/utils/pileup/PileupElement.java | 4 ++-- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index e10334a77..6b4fec04e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -6,9 +6,7 @@ import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.Arrays; import java.util.EnumSet; -import java.util.List; /* * Copyright (c) 2009 The Broad Institute diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index daf6606ef..bab20b9e8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -95,11 +95,11 @@ public class PileupElement implements Comparable { // -------------------------------------------------------------------------- public boolean isReducedRead() { - return ((GATKSAMRecord)read).isReducedRead(); + return read.isReducedRead(); } public int getRepresentativeCount() { - return isReducedRead() ? ((GATKSAMRecord)read).getReducedCount(offset) : 1; + return isReducedRead() ? read.getReducedCount(offset) : 1; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index e125b8c80..8d9018045 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -243,7 +243,7 @@ public class ReadUtils { public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) { Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); - GATKSAMRecord result = (GATKSAMRecord)read; + GATKSAMRecord result = read; if ( adaptorBoundaries != null ) { if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() ) From 7aee80cd3b909c4982d932f2a9bea6ab762b408d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 14 Nov 2011 12:23:46 -0500 Subject: [PATCH 08/12] Fix to deal with reduced reads containing a deletion --- .../org/broadinstitute/sting/utils/pileup/PileupElement.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index bab20b9e8..2d13d6e59 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -99,7 +99,8 @@ public class PileupElement implements Comparable { } public int getRepresentativeCount() { - return isReducedRead() ? read.getReducedCount(offset) : 1; + // TODO -- if we ever decide to reduce the representation of deletions then this will need to be fixed + return (!isDeletion() && isReducedRead()) ? read.getReducedCount(offset) : 1; } } \ No newline at end of file From 7b2a7cfbe763cfa8aeca37205dfe9b1ffedee094 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 14 Nov 2011 14:31:27 -0500 Subject: [PATCH 10/12] Transfer headers from the resource VCF when possible when using expressions. While there, VA was modified so that it didn't assume that the ID field was present in the VC's info map in preparation for Mark's upcoming changes. --- .../walkers/annotator/VariantAnnotator.java | 25 +++++++++++++++++-- .../annotator/VariantAnnotatorEngine.java | 23 +++++++++++------ .../VariantAnnotatorIntegrationTest.java | 2 +- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index ea11391d9..20e72dd57 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -222,8 +222,29 @@ public class VariantAnnotator extends RodWalker implements Ann if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } - for ( String expression : expressionsToUse ) - hInfo.add(new VCFInfoHeaderLine(expression, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource")); + // for the expressions, pull the info header line from the header of the resource rod + for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) { + // special case the ID field + if ( expression.fieldName.equals("ID") ) { + hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource")); + continue; + } + VCFInfoHeaderLine targetHeaderLine = null; + for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) { + if ( line instanceof VCFInfoHeaderLine ) { + VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; + if ( infoline.getName().equals(expression.fieldName) ) { + targetHeaderLine = infoline; + break; + } + } + } + + if ( targetHeaderLine != null ) + hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription())); + else + hInfo.add(new VCFInfoHeaderLine(expression.fullName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource")); + } engine.invokeAnnotationInitializationMethods(hInfo); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index e4bc0d5d5..20f28007a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -49,20 +49,20 @@ public class VariantAnnotatorEngine { private AnnotatorCompatibleWalker walker; private GenomeAnalysisEngine toolkit; - private static class VAExpression { + protected static class VAExpression { public String fullName, fieldName; public RodBinding binding; - public VAExpression(String fullEpression, List> bindings) { - int indexOfDot = fullEpression.lastIndexOf("."); + public VAExpression(String fullExpression, List> bindings) { + int indexOfDot = fullExpression.lastIndexOf("."); if ( indexOfDot == -1 ) - throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format"); + throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format"); - fullName = fullEpression; - fieldName = fullEpression.substring(indexOfDot+1); + fullName = fullExpression; + fieldName = fullExpression.substring(indexOfDot+1); - String bindingName = fullEpression.substring(0, indexOfDot); + String bindingName = fullExpression.substring(0, indexOfDot); for ( RodBinding rod : bindings ) { if ( rod.getName().equals(bindingName) ) { binding = rod; @@ -97,6 +97,8 @@ public class VariantAnnotatorEngine { requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); } + protected List getRequestedExpressions() { return requestedExpressions; } + private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse, List annotationsToExclude) { AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse); requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse); @@ -211,8 +213,13 @@ public class VariantAnnotatorEngine { continue; VariantContext vc = VCs.iterator().next(); - if ( vc.hasAttribute(expression.fieldName) ) + // special-case the ID field + if ( expression.fieldName.equals("ID") ) { + if ( vc.hasID() ) + infoAnnotations.put(expression.fullName, vc.getID()); + } else if ( vc.hasAttribute(expression.fieldName) ) { infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 189f643d4..bde4c4a8f 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -128,7 +128,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("4a6f0675242f685e9072c1da5ad9e715")); + Arrays.asList("1b4921085b26cbfe07d53b7c947de1e5")); executeTest("using expression with ID", spec); } From 4dc9dbe890480eea992d89f740fe20f06bd2a086 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 14 Nov 2011 14:42:12 -0500 Subject: [PATCH 11/12] One quick fix to previous commit --- .../sting/gatk/walkers/annotator/VariantAnnotator.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 20e72dd57..c9ea7a3b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -240,10 +240,14 @@ public class VariantAnnotator extends RodWalker implements Ann } } - if ( targetHeaderLine != null ) - hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription())); - else + if ( targetHeaderLine != null ) { + if ( targetHeaderLine.getCountType() == VCFHeaderLineCount.INTEGER ) + hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCount(), targetHeaderLine.getType(), targetHeaderLine.getDescription())); + else + hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription())); + } else { hInfo.add(new VCFInfoHeaderLine(expression.fullName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource")); + } } engine.invokeAnnotationInitializationMethods(hInfo);