Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
2e9d5363e7
|
|
@ -68,8 +68,9 @@ import java.util.*;
|
||||||
* -T VariantAnnotator \
|
* -T VariantAnnotator \
|
||||||
* -I input.bam \
|
* -I input.bam \
|
||||||
* -o output.vcf \
|
* -o output.vcf \
|
||||||
* -A DepthOfCoverage
|
* -A DepthOfCoverage \
|
||||||
* --variant input.vcf \
|
* --variant input.vcf \
|
||||||
|
* -L input.vcf \
|
||||||
* --dbsnp dbsnp.vcf
|
* --dbsnp dbsnp.vcf
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
*
|
||||||
|
|
@ -221,8 +222,33 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
||||||
if ( isUniqueHeaderLine(line, hInfo) )
|
if ( isUniqueHeaderLine(line, hInfo) )
|
||||||
hInfo.add(line);
|
hInfo.add(line);
|
||||||
}
|
}
|
||||||
for ( String expression : expressionsToUse )
|
// for the expressions, pull the info header line from the header of the resource rod
|
||||||
hInfo.add(new VCFInfoHeaderLine(expression, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource"));
|
for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) {
|
||||||
|
// special case the ID field
|
||||||
|
if ( expression.fieldName.equals("ID") ) {
|
||||||
|
hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource"));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
VCFInfoHeaderLine targetHeaderLine = null;
|
||||||
|
for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
|
||||||
|
if ( line instanceof VCFInfoHeaderLine ) {
|
||||||
|
VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
|
||||||
|
if ( infoline.getName().equals(expression.fieldName) ) {
|
||||||
|
targetHeaderLine = infoline;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( targetHeaderLine != null ) {
|
||||||
|
if ( targetHeaderLine.getCountType() == VCFHeaderLineCount.INTEGER )
|
||||||
|
hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCount(), targetHeaderLine.getType(), targetHeaderLine.getDescription()));
|
||||||
|
else
|
||||||
|
hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription()));
|
||||||
|
} else {
|
||||||
|
hInfo.add(new VCFInfoHeaderLine(expression.fullName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
engine.invokeAnnotationInitializationMethods(hInfo);
|
engine.invokeAnnotationInitializationMethods(hInfo);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -50,20 +50,20 @@ public class VariantAnnotatorEngine {
|
||||||
private AnnotatorCompatibleWalker walker;
|
private AnnotatorCompatibleWalker walker;
|
||||||
private GenomeAnalysisEngine toolkit;
|
private GenomeAnalysisEngine toolkit;
|
||||||
|
|
||||||
private static class VAExpression {
|
protected static class VAExpression {
|
||||||
|
|
||||||
public String fullName, fieldName;
|
public String fullName, fieldName;
|
||||||
public RodBinding<VariantContext> binding;
|
public RodBinding<VariantContext> binding;
|
||||||
|
|
||||||
public VAExpression(String fullEpression, List<RodBinding<VariantContext>> bindings) {
|
public VAExpression(String fullExpression, List<RodBinding<VariantContext>> bindings) {
|
||||||
int indexOfDot = fullEpression.lastIndexOf(".");
|
int indexOfDot = fullExpression.lastIndexOf(".");
|
||||||
if ( indexOfDot == -1 )
|
if ( indexOfDot == -1 )
|
||||||
throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format");
|
throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format");
|
||||||
|
|
||||||
fullName = fullEpression;
|
fullName = fullExpression;
|
||||||
fieldName = fullEpression.substring(indexOfDot+1);
|
fieldName = fullExpression.substring(indexOfDot+1);
|
||||||
|
|
||||||
String bindingName = fullEpression.substring(0, indexOfDot);
|
String bindingName = fullExpression.substring(0, indexOfDot);
|
||||||
for ( RodBinding<VariantContext> rod : bindings ) {
|
for ( RodBinding<VariantContext> rod : bindings ) {
|
||||||
if ( rod.getName().equals(bindingName) ) {
|
if ( rod.getName().equals(bindingName) ) {
|
||||||
binding = rod;
|
binding = rod;
|
||||||
|
|
@ -98,6 +98,8 @@ public class VariantAnnotatorEngine {
|
||||||
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
|
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; }
|
||||||
|
|
||||||
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
|
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
|
||||||
AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse);
|
AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse);
|
||||||
requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse);
|
requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse);
|
||||||
|
|
@ -212,10 +214,15 @@ public class VariantAnnotatorEngine {
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
VariantContext vc = VCs.iterator().next();
|
VariantContext vc = VCs.iterator().next();
|
||||||
if ( vc.hasAttribute(expression.fieldName) )
|
// special-case the ID field
|
||||||
|
if ( expression.fieldName.equals("ID") ) {
|
||||||
|
if ( vc.hasID() )
|
||||||
|
infoAnnotations.put(expression.fullName, vc.getID());
|
||||||
|
} else if ( vc.hasAttribute(expression.fieldName) ) {
|
||||||
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
|
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private GenotypeCollection annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
private GenotypeCollection annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
if ( requestedGenotypeAnnotations.size() == 0 )
|
if ( requestedGenotypeAnnotations.size() == 0 )
|
||||||
|
|
|
||||||
|
|
@ -6,9 +6,7 @@ import org.broadinstitute.sting.utils.NGSPlatform;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.EnumSet;
|
import java.util.EnumSet;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009 The Broad Institute
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,7 @@ import org.broadinstitute.sting.alignment.Alignment;
|
||||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||||
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
import org.broadinstitute.sting.commandline.*;
|
||||||
import org.broadinstitute.sting.commandline.Input;
|
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
|
||||||
import org.broadinstitute.sting.commandline.RodBinding;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
|
@ -126,6 +123,11 @@ public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
||||||
@Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false)
|
@Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false)
|
||||||
boolean doNotUseBWA = false;
|
boolean doNotUseBWA = false;
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(doc="Use Sequenom output format instead of regular FASTA",fullName="sqnm",required=false)
|
||||||
|
boolean sequenomOutput = false;
|
||||||
|
|
||||||
|
|
||||||
GenomeLoc prevInterval;
|
GenomeLoc prevInterval;
|
||||||
GenomeLoc allelePos;
|
GenomeLoc allelePos;
|
||||||
String probeName;
|
String probeName;
|
||||||
|
|
@ -485,6 +487,13 @@ public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
|
String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
|
||||||
|
|
||||||
|
if (!sequenomOutput)
|
||||||
out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
|
out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
|
||||||
|
else {
|
||||||
|
seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record
|
||||||
|
probeName = probeName.replace("amplicon_","a");
|
||||||
|
out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,7 @@ public class GaussianMixtureModel {
|
||||||
private final double[] empiricalMu;
|
private final double[] empiricalMu;
|
||||||
private final Matrix empiricalSigma;
|
private final Matrix empiricalSigma;
|
||||||
public boolean isModelReadyForEvaluation;
|
public boolean isModelReadyForEvaluation;
|
||||||
|
public boolean failedToConverge = false;
|
||||||
|
|
||||||
public GaussianMixtureModel( final int numGaussians, final int numAnnotations,
|
public GaussianMixtureModel( final int numGaussians, final int numAnnotations,
|
||||||
final double shrinkage, final double dirichletParameter, final double priorCounts ) {
|
final double shrinkage, final double dirichletParameter, final double priorCounts ) {
|
||||||
|
|
|
||||||
|
|
@ -309,8 +309,23 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
engine.evaluateData( dataManager.getData(), goodModel, false );
|
engine.evaluateData( dataManager.getData(), goodModel, false );
|
||||||
|
|
||||||
// Generate the negative model using the worst performing data and evaluate each variant contrastively
|
// Generate the negative model using the worst performing data and evaluate each variant contrastively
|
||||||
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) );
|
final ExpandingArrayList<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS );
|
||||||
|
GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData );
|
||||||
engine.evaluateData( dataManager.getData(), badModel, true );
|
engine.evaluateData( dataManager.getData(), badModel, true );
|
||||||
|
|
||||||
|
// Detect if the negative model failed to converge because of too few points and/or too many Gaussians and try again
|
||||||
|
while( badModel.failedToConverge && VRAC.MAX_GAUSSIANS > 4 ) {
|
||||||
|
logger.info("Negative model failed to converge. Retrying...");
|
||||||
|
VRAC.MAX_GAUSSIANS--;
|
||||||
|
badModel = engine.generateModel( negativeTrainingData );
|
||||||
|
engine.evaluateData( dataManager.getData(), goodModel, false );
|
||||||
|
engine.evaluateData( dataManager.getData(), badModel, true );
|
||||||
|
}
|
||||||
|
|
||||||
|
if( badModel.failedToConverge || goodModel.failedToConverge ) {
|
||||||
|
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)");
|
||||||
|
}
|
||||||
|
|
||||||
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
|
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
|
||||||
|
|
||||||
// Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user
|
// Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user
|
||||||
|
|
|
||||||
|
|
@ -67,14 +67,20 @@ public class VariantRecalibratorEngine {
|
||||||
|
|
||||||
public void evaluateData( final List<VariantDatum> data, final GaussianMixtureModel model, final boolean evaluateContrastively ) {
|
public void evaluateData( final List<VariantDatum> data, final GaussianMixtureModel model, final boolean evaluateContrastively ) {
|
||||||
if( !model.isModelReadyForEvaluation ) {
|
if( !model.isModelReadyForEvaluation ) {
|
||||||
|
try {
|
||||||
model.precomputeDenominatorForEvaluation();
|
model.precomputeDenominatorForEvaluation();
|
||||||
|
} catch( Exception e ) {
|
||||||
|
model.failedToConverge = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Evaluating full set of " + data.size() + " variants...");
|
logger.info("Evaluating full set of " + data.size() + " variants...");
|
||||||
for( final VariantDatum datum : data ) {
|
for( final VariantDatum datum : data ) {
|
||||||
final double thisLod = evaluateDatum( datum, model );
|
final double thisLod = evaluateDatum( datum, model );
|
||||||
if( Double.isNaN(thisLod) ) {
|
if( Double.isNaN(thisLod) ) {
|
||||||
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)");
|
model.failedToConverge = true;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
datum.lod = ( evaluateContrastively ?
|
datum.lod = ( evaluateContrastively ?
|
||||||
|
|
|
||||||
|
|
@ -95,11 +95,12 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
// --------------------------------------------------------------------------
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
public boolean isReducedRead() {
|
public boolean isReducedRead() {
|
||||||
return ((GATKSAMRecord)read).isReducedRead();
|
return read.isReducedRead();
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getRepresentativeCount() {
|
public int getRepresentativeCount() {
|
||||||
return isReducedRead() ? ((GATKSAMRecord)read).getReducedCount(offset) : 1;
|
// TODO -- if we ever decide to reduce the representation of deletions then this will need to be fixed
|
||||||
|
return (!isDeletion() && isReducedRead()) ? read.getReducedCount(offset) : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -243,7 +243,7 @@ public class ReadUtils {
|
||||||
public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) {
|
public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) {
|
||||||
|
|
||||||
Pair<Integer, Integer> adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength);
|
Pair<Integer, Integer> adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength);
|
||||||
GATKSAMRecord result = (GATKSAMRecord)read;
|
GATKSAMRecord result = read;
|
||||||
|
|
||||||
if ( adaptorBoundaries != null ) {
|
if ( adaptorBoundaries != null ) {
|
||||||
if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() )
|
if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() )
|
||||||
|
|
|
||||||
|
|
@ -128,7 +128,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
||||||
public void testUsingExpressionWithID() {
|
public void testUsingExpressionWithID() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
|
baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
|
||||||
Arrays.asList("4a6f0675242f685e9072c1da5ad9e715"));
|
Arrays.asList("1b4921085b26cbfe07d53b7c947de1e5"));
|
||||||
executeTest("using expression with ID", spec);
|
executeTest("using expression with ID", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue