Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Mark DePristo 2011-11-14 15:32:06 -05:00
commit 2e9d5363e7
10 changed files with 88 additions and 25 deletions

View File

@ -68,8 +68,9 @@ import java.util.*;
* -T VariantAnnotator \ * -T VariantAnnotator \
* -I input.bam \ * -I input.bam \
* -o output.vcf \ * -o output.vcf \
* -A DepthOfCoverage * -A DepthOfCoverage \
* --variant input.vcf \ * --variant input.vcf \
* -L input.vcf \
* --dbsnp dbsnp.vcf * --dbsnp dbsnp.vcf
* </pre> * </pre>
* *
@ -221,8 +222,33 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
if ( isUniqueHeaderLine(line, hInfo) ) if ( isUniqueHeaderLine(line, hInfo) )
hInfo.add(line); hInfo.add(line);
} }
for ( String expression : expressionsToUse ) // for the expressions, pull the info header line from the header of the resource rod
hInfo.add(new VCFInfoHeaderLine(expression, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource")); for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) {
// special case the ID field
if ( expression.fieldName.equals("ID") ) {
hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource"));
continue;
}
VCFInfoHeaderLine targetHeaderLine = null;
for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
if ( line instanceof VCFInfoHeaderLine ) {
VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
if ( infoline.getName().equals(expression.fieldName) ) {
targetHeaderLine = infoline;
break;
}
}
}
if ( targetHeaderLine != null ) {
if ( targetHeaderLine.getCountType() == VCFHeaderLineCount.INTEGER )
hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCount(), targetHeaderLine.getType(), targetHeaderLine.getDescription()));
else
hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription()));
} else {
hInfo.add(new VCFInfoHeaderLine(expression.fullName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource"));
}
}
engine.invokeAnnotationInitializationMethods(hInfo); engine.invokeAnnotationInitializationMethods(hInfo);

View File

@ -50,20 +50,20 @@ public class VariantAnnotatorEngine {
private AnnotatorCompatibleWalker walker; private AnnotatorCompatibleWalker walker;
private GenomeAnalysisEngine toolkit; private GenomeAnalysisEngine toolkit;
private static class VAExpression { protected static class VAExpression {
public String fullName, fieldName; public String fullName, fieldName;
public RodBinding<VariantContext> binding; public RodBinding<VariantContext> binding;
public VAExpression(String fullEpression, List<RodBinding<VariantContext>> bindings) { public VAExpression(String fullExpression, List<RodBinding<VariantContext>> bindings) {
int indexOfDot = fullEpression.lastIndexOf("."); int indexOfDot = fullExpression.lastIndexOf(".");
if ( indexOfDot == -1 ) if ( indexOfDot == -1 )
throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format"); throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format");
fullName = fullEpression; fullName = fullExpression;
fieldName = fullEpression.substring(indexOfDot+1); fieldName = fullExpression.substring(indexOfDot+1);
String bindingName = fullEpression.substring(0, indexOfDot); String bindingName = fullExpression.substring(0, indexOfDot);
for ( RodBinding<VariantContext> rod : bindings ) { for ( RodBinding<VariantContext> rod : bindings ) {
if ( rod.getName().equals(bindingName) ) { if ( rod.getName().equals(bindingName) ) {
binding = rod; binding = rod;
@ -98,6 +98,8 @@ public class VariantAnnotatorEngine {
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
} }
protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; }
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) { private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse); AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse);
requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse); requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse);
@ -212,8 +214,13 @@ public class VariantAnnotatorEngine {
continue; continue;
VariantContext vc = VCs.iterator().next(); VariantContext vc = VCs.iterator().next();
if ( vc.hasAttribute(expression.fieldName) ) // special-case the ID field
if ( expression.fieldName.equals("ID") ) {
if ( vc.hasID() )
infoAnnotations.put(expression.fullName, vc.getID());
} else if ( vc.hasAttribute(expression.fieldName) ) {
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
}
} }
} }

View File

@ -6,9 +6,7 @@ import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.List;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute

View File

@ -7,10 +7,7 @@ import org.broadinstitute.sting.alignment.Alignment;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles; import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -126,6 +123,11 @@ public class ValidationAmplicons extends RodWalker<Integer,Integer> {
@Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false) @Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false)
boolean doNotUseBWA = false; boolean doNotUseBWA = false;
@Hidden
@Argument(doc="Use Sequenom output format instead of regular FASTA",fullName="sqnm",required=false)
boolean sequenomOutput = false;
GenomeLoc prevInterval; GenomeLoc prevInterval;
GenomeLoc allelePos; GenomeLoc allelePos;
String probeName; String probeName;
@ -485,6 +487,13 @@ public class ValidationAmplicons extends RodWalker<Integer,Integer> {
} }
String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
if (!sequenomOutput)
out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
else {
seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record
probeName = probeName.replace("amplicon_","a");
out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity);
}
} }
} }

View File

@ -52,6 +52,7 @@ public class GaussianMixtureModel {
private final double[] empiricalMu; private final double[] empiricalMu;
private final Matrix empiricalSigma; private final Matrix empiricalSigma;
public boolean isModelReadyForEvaluation; public boolean isModelReadyForEvaluation;
public boolean failedToConverge = false;
public GaussianMixtureModel( final int numGaussians, final int numAnnotations, public GaussianMixtureModel( final int numGaussians, final int numAnnotations,
final double shrinkage, final double dirichletParameter, final double priorCounts ) { final double shrinkage, final double dirichletParameter, final double priorCounts ) {

View File

@ -309,8 +309,23 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
engine.evaluateData( dataManager.getData(), goodModel, false ); engine.evaluateData( dataManager.getData(), goodModel, false );
// Generate the negative model using the worst performing data and evaluate each variant contrastively // Generate the negative model using the worst performing data and evaluate each variant contrastively
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) ); final ExpandingArrayList<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS );
GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData );
engine.evaluateData( dataManager.getData(), badModel, true ); engine.evaluateData( dataManager.getData(), badModel, true );
// Detect if the negative model failed to converge because of too few points and/or too many Gaussians and try again
while( badModel.failedToConverge && VRAC.MAX_GAUSSIANS > 4 ) {
logger.info("Negative model failed to converge. Retrying...");
VRAC.MAX_GAUSSIANS--;
badModel = engine.generateModel( negativeTrainingData );
engine.evaluateData( dataManager.getData(), goodModel, false );
engine.evaluateData( dataManager.getData(), badModel, true );
}
if( badModel.failedToConverge || goodModel.failedToConverge ) {
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)");
}
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel ); engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
// Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user // Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user

View File

@ -67,14 +67,20 @@ public class VariantRecalibratorEngine {
public void evaluateData( final List<VariantDatum> data, final GaussianMixtureModel model, final boolean evaluateContrastively ) { public void evaluateData( final List<VariantDatum> data, final GaussianMixtureModel model, final boolean evaluateContrastively ) {
if( !model.isModelReadyForEvaluation ) { if( !model.isModelReadyForEvaluation ) {
model.precomputeDenominatorForEvaluation(); try {
model.precomputeDenominatorForEvaluation();
} catch( Exception e ) {
model.failedToConverge = true;
return;
}
} }
logger.info("Evaluating full set of " + data.size() + " variants..."); logger.info("Evaluating full set of " + data.size() + " variants...");
for( final VariantDatum datum : data ) { for( final VariantDatum datum : data ) {
final double thisLod = evaluateDatum( datum, model ); final double thisLod = evaluateDatum( datum, model );
if( Double.isNaN(thisLod) ) { if( Double.isNaN(thisLod) ) {
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)"); model.failedToConverge = true;
return;
} }
datum.lod = ( evaluateContrastively ? datum.lod = ( evaluateContrastively ?

View File

@ -95,11 +95,12 @@ public class PileupElement implements Comparable<PileupElement> {
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
public boolean isReducedRead() { public boolean isReducedRead() {
return ((GATKSAMRecord)read).isReducedRead(); return read.isReducedRead();
} }
public int getRepresentativeCount() { public int getRepresentativeCount() {
return isReducedRead() ? ((GATKSAMRecord)read).getReducedCount(offset) : 1; // TODO -- if we ever decide to reduce the representation of deletions then this will need to be fixed
return (!isDeletion() && isReducedRead()) ? read.getReducedCount(offset) : 1;
} }
} }

View File

@ -243,7 +243,7 @@ public class ReadUtils {
public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) { public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) {
Pair<Integer, Integer> adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); Pair<Integer, Integer> adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength);
GATKSAMRecord result = (GATKSAMRecord)read; GATKSAMRecord result = read;
if ( adaptorBoundaries != null ) { if ( adaptorBoundaries != null ) {
if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() ) if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() )

View File

@ -128,7 +128,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testUsingExpressionWithID() { public void testUsingExpressionWithID() {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1, baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
Arrays.asList("4a6f0675242f685e9072c1da5ad9e715")); Arrays.asList("1b4921085b26cbfe07d53b7c947de1e5"));
executeTest("using expression with ID", spec); executeTest("using expression with ID", spec);
} }