From 367bbee25a6bd73248a9aa2834c5c1fb5e625e83 Mon Sep 17 00:00:00 2001
From: Khalid Shakir
- * A filtered VCF. - *
- * *
* java -Xmx2g -jar GenomeAnalysisTK.jar \
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
index 699133e38..1c65102ae 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@@ -1085,14 +1085,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati
}
public void validateReferenceBases(Allele reference, Byte paddedRefBase) {
- // don't validate if we're an insertion or complex event
- if ( !reference.isNull() && getReference().length() == 1 && !reference.basesMatch(getReference()) ) {
- throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
+ // don't validate if we're a complex event
+ if ( !isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference()) ) {
+ throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
}
// we also need to validate the padding base for simple indels
- if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) )
- throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), (char)getReferenceBaseForIndel().byteValue(), (char)paddedRefBase.byteValue()));
+ if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) {
+ throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), (char)paddedRefBase.byteValue(), (char)getReferenceBaseForIndel().byteValue()));
+ }
}
public void validateRSIDs(Set rsIDs) {
From 354529bff37626c5f46de89a60987d3d3fd40aec Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Fri, 9 Sep 2011 13:15:24 -0400
Subject: [PATCH 08/49] adding Validate Variants integration test with a
deletion
---
.../ValidateVariantsIntegrationTest.java | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
index adf3b21a8..3d41be1ae 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
@@ -113,4 +113,16 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
executeTest("test bad alt allele", spec);
}
+
+ @Test
+ public void testBadAllele2() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ baseTestString("validationExampleBad3.vcf", "ALLELES"),
+ 0,
+ UserException.MalformedFile.class
+ );
+
+ executeTest("test bad alt allele", spec);
+ }
+
}
From 1953edcd2d1a2292117cb07ff4bb0e48e2605f0e Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Fri, 9 Sep 2011 13:39:08 -0400
Subject: [PATCH 09/49] updating Validate Variants deletion integration test
---
.../walkers/variantutils/ValidateVariantsIntegrationTest.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
index 3d41be1ae..5f71f82fd 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
@@ -117,12 +117,12 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
@Test
public void testBadAllele2() {
WalkerTestSpec spec = new WalkerTestSpec(
- baseTestString("validationExampleBad3.vcf", "ALLELES"),
+ baseTestString("validationExampleBad3.vcf", "REF"),
0,
UserException.MalformedFile.class
);
- executeTest("test bad alt allele", spec);
+ executeTest("test bad ref allele in deletion", spec);
}
}
From 7f9000382e4d9ef30e25739401b72e7d1a7c2fcc Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro
Date: Fri, 9 Sep 2011 14:09:11 -0400
Subject: [PATCH 10/49] Making indel calls default in the MDCP
You can turn off indel calling by using -noIndels.
---
.../queue/qscripts/MethodsDevelopmentCallingPipeline.scala | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
index 80bfe03d1..17d614290 100755
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
@@ -22,8 +22,8 @@ class MethodsDevelopmentCallingPipeline extends QScript {
@Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
var noBAQ: Boolean = false
- @Argument(shortName="indels", doc="calls indels with the Unified Genotyper", required=false)
- var callIndels: Boolean = false
+ @Argument(shortName="noIndels", doc="do not call indels with the Unified Genotyper", required=false)
+ var noIndels: Boolean = false
@Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false)
var LOCAL_ET: Boolean = false
@@ -165,7 +165,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
val goldStandard = true
for (target <- targets) {
if( !skipCalling ) {
- if (callIndels) add(new indelCall(target), new indelFilter(target), new indelEvaluation(target))
+ if (!noIndels) add(new indelCall(target), new indelFilter(target), new indelEvaluation(target))
add(new snpCall(target))
add(new VQSR(target, !goldStandard))
add(new applyVQSR(target, !goldStandard))
From 9e650dfc17621a76e421184ae33b03c2515f193c Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro
Date: Fri, 9 Sep 2011 16:25:31 -0400
Subject: [PATCH 11/49] Fixing SelectVariants documentation
getting rid of messages telling users to go for the YAML file. The idea is to not support these anymore.
---
.../gatk/walkers/variantutils/SelectVariants.java | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
index 35ff66243..018c4dcc2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -145,10 +145,9 @@ import java.util.*;
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
- * -o output.vcf \
- * -SM family.yaml \
* -family NA12891+NA12892=NA12878 \
- * -mvq 50
+ * -mvq 50 \
+ * -o violations.vcf
*
* Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
@@ -265,17 +264,17 @@ public class SelectVariants extends RodWalker {
private File AF_FILE = new File("");
@Hidden
- @Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
+ @Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false)
private File FAMILY_STRUCTURE_FILE = null;
/**
* String formatted as dad+mom=child where these parameters determine which sample names are examined.
*/
- @Argument(fullName="family_structure", shortName="family", doc="Deprecated; use the -SM argument instead", required=false)
+ @Argument(fullName="family_structure", shortName="family", doc="string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String FAMILY_STRUCTURE = "";
/**
- * Sample metadata information will be taken from a YAML file (see the -SM argument).
+ * This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure.
*/
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false)
private Boolean MENDELIAN_VIOLATIONS = false;
@@ -306,7 +305,7 @@ public class SelectVariants extends RodWalker {
@Hidden
- @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
+ @Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false)
private String outMVFile = null;
/* Private class used to store the intermediate variants in the integer random selection process */
From a807205fc3966855a4ab122ac4aa0548e425178b Mon Sep 17 00:00:00 2001
From: Guillermo del Angel
Date: Fri, 9 Sep 2011 18:00:23 -0400
Subject: [PATCH 12/49] a) Minor optimization to softMax() computation to avoid
redundant operations, results in about 5-10% increase in speed in indel
calling. b) Added (but left commented out since it may affect integration
tests and to isolate commits) fix to per-sample DP reporting, so that
deletions are included in count. c) Bug fix to avoid having non-reference
genotypes assigned to samples with PL=0,0,0. Correct behavior should be to
no-call these samples, and to ignore these samples when computing AC
distribution since their likelihoods are not informative.
---
.../genotyper/ExactAFCalculationModel.java | 84 +++++++++++++------
...elGenotypeLikelihoodsCalculationModel.java | 20 ++++-
.../broadinstitute/sting/utils/MathUtils.java | 54 +++++-------
3 files changed, 95 insertions(+), 63 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java
index cd006a3cf..6ae437b27 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java
@@ -63,7 +63,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private boolean SIMPLE_GREEDY_GENOTYPER = false;
-
+ private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
final private ExactCalculation calcToUse;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
@@ -178,22 +178,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
- private static final double[][] getGLs(Map GLs) {
- double[][] genotypeLikelihoods = new double[GLs.size()+1][];
+ private static final ArrayList getGLs(Map GLs) {
+ ArrayList genotypeLikelihoods = new ArrayList();
- int j = 0;
+ //int j = 0;
+ genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.values() ) {
- j++;
-
if ( sample.hasLikelihoods() ) {
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
- genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector();
+ double[] gls = sample.getLikelihoods().getAsVector();
+
+ if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
+ genotypeLikelihoods.add(gls);
}
}
return genotypeLikelihoods;
}
+
// -------------------------------------------------------------------------------------
//
// Linearized, ~O(N), implementation.
@@ -318,9 +321,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public int linearExact(Map GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
- final int numSamples = GLs.size();
+ final ArrayList genotypeLikelihoods = getGLs(GLs);
+ final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
- final double[][] genotypeLikelihoods = getGLs(GLs);
final ExactACCache logY = new ExactACCache(numSamples+1);
logY.getkMinus0()[0] = 0.0; // the zero case
@@ -334,14 +337,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( k == 0 ) { // special case for k = 0
for ( int j=1; j <= numSamples; j++ ) {
- kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA];
+ kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
}
} else { // k > 0
final double[] kMinus1 = logY.getkMinus1();
final double[] kMinus2 = logY.getkMinus2();
for ( int j=1; j <= numSamples; j++ ) {
- final double[] gl = genotypeLikelihoods[j];
+ final double[] gl = genotypeLikelihoods.get(j);
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
double aa = Double.NEGATIVE_INFINITY;
@@ -434,10 +437,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
- boolean multiAllelicRecord = false;
-
- if (vc.getAlternateAlleles().size() > 1)
- multiAllelicRecord = true;
Map GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
@@ -454,7 +453,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
pathMetricArray[0][0] = 0.0;
// todo = can't deal with optimal dynamic programming solution with multiallelic records
- if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) {
+ if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
sampleIndices.addAll(GLs.keySet());
sampleIdx = GLs.size();
}
@@ -465,6 +464,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
continue;
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
+
+ if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
+ //System.out.print(sample.getKey()+":");
+ //for (int k=0; k < likelihoods.length; k++)
+ // System.out.format("%4.2f ",likelihoods[k]);
+ //System.out.println();
+ // all likelihoods are essentially the same: skip this sample and will later on force no call.
+ //sampleIdx++;
+ continue;
+ }
+
sampleIndices.add(sample.getKey());
for (int k=0; k <= AFofMaxLikelihood; k++) {
@@ -504,22 +514,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
-
- if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord)
- bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
- else {
- int newIdx = tracebackArray[k][startIdx];
- bestGTguess = startIdx - newIdx;
- startIdx = newIdx;
- }
-
+ // if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
+ // and will add no-call genotype to GL's in a second pass
ArrayList myAlleles = new ArrayList();
double qual = Double.NEGATIVE_INFINITY;
double[] likelihoods = g.getLikelihoods().getAsVector();
+
+ if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
+ bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
+ }
+ else {
+ int newIdx = tracebackArray[k][startIdx];
+ bestGTguess = startIdx - newIdx;
+ startIdx = newIdx;
+ }
+
/* System.out.format("Sample: %s GL:",sample);
for (int i=0; i < likelihoods.length; i++)
- System.out.format("%1.4f ",likelihoods[i]);
+ System.out.format("%1.4f, ",likelihoods[i]);
*/
for (int i=0; i < likelihoods.length; i++) {
@@ -570,6 +583,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
+ for ( Map.Entry sample : GLs.entrySet() ) {
+
+ if ( !sample.getValue().hasLikelihoods() )
+ continue;
+ Genotype g = GLs.get(sample.getKey());
+
+ double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
+
+ if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
+ continue; // regular likelihoods
+
+ ArrayList myAlleles = new ArrayList();
+
+ double qual = Genotype.NO_NEG_LOG_10PERROR;
+ myAlleles.add(Allele.NO_CALL);
+ myAlleles.add(Allele.NO_CALL);
+ //System.out.println(myAlleles.toString());
+ calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
+ }
return calls;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
index 07f02de57..2a99f1aad 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@@ -32,7 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
+import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
@@ -413,16 +415,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (pileup != null ) {
double[] genotypeLikelihoods;
+
if (useOldWrongHorribleHackedUpLikelihoodModel)
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
else
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
-
- // which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example
- // for 3 alleles it's 00 01 11 02 12 22
- GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
+ GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
alleleList,
genotypeLikelihoods,
getFilteredDepth(pileup)));
@@ -444,4 +444,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return indelLikelihoodMap.get();
}
+ // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
+ // so that per-sample DP will include deletions covering the event.
+ protected int getFilteredDepth(ReadBackedPileup pileup) {
+ int count = 0;
+ for ( PileupElement p : pileup ) {
+ if (/*p.isDeletion() ||*/ BaseUtils.isRegularBase(p.getBase()) )
+ count++;
+ }
+
+ return count;
+ }
+
}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
index f4c057c15..0d85f9606 100644
--- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
@@ -1056,42 +1056,30 @@ public class MathUtils {
}
static public double softMax(final double x, final double y) {
- if (Double.isInfinite(x))
- return y;
+ // we need to compute log10(10^x + 10^y)
+ // By Jacobian logarithm identity, this is equal to
+ // max(x,y) + log10(1+10^-abs(x-y))
+ // we compute the second term as a table lookup
+ // with integer quantization
- if (Double.isInfinite(y))
- return x;
+ // slow exact version:
+ // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
- if (y >= x + MAX_JACOBIAN_TOLERANCE)
- return y;
- if (x >= y + MAX_JACOBIAN_TOLERANCE)
- return x;
+ double diff = x-y;
- // OK, so |y-x| < tol: we use the following identity then:
- // we need to compute log10(10^x + 10^y)
- // By Jacobian logarithm identity, this is equal to
- // max(x,y) + log10(1+10^-abs(x-y))
- // we compute the second term as a table lookup
- // with integer quantization
-
- //double diff = Math.abs(x-y);
- double diff = x-y;
- double t1 =x;
- if (diff<0) { //
- t1 = y;
- diff= -diff;
- }
- // t has max(x,y), diff has abs(x-y)
- // we have pre-stored correction for 0,0.1,0.2,... 10.0
- //int ind = (int)Math.round(diff*INV_JACOBIAN_LOG_TABLE_STEP);
- int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
- // gdebug+
- //double z =Math.log10(1+Math.pow(10.0,-diff));
- //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
- //gdebug-
- return t1+jacobianLogTable[ind];
- // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
- }
+ if (diff > MAX_JACOBIAN_TOLERANCE)
+ return x;
+ else if (diff < -MAX_JACOBIAN_TOLERANCE)
+ return y;
+ else if (diff >= 0) {
+ int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
+ return x + jacobianLogTable[ind];
+ }
+ else {
+ int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
+ return y + jacobianLogTable[ind];
+ }
+ }
public static double phredScaleToProbability (byte q) {
return Math.pow(10,(-q)/10.0);
From b399424a9cd4c842e8dac0e2e0f9c17ba4002ff4 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel
Date: Fri, 9 Sep 2011 20:44:47 -0400
Subject: [PATCH 13/49] Fix integration test affected by non-calling all-zero
PL samples, and add a more complicated multi-sample integration test from a
phase 1 case, GBR with mixed technologies and complex input alleles
---
.../genotyper/UnifiedGenotyperIntegrationTest.java | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
index 185880401..e212e07ea 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@@ -18,6 +18,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129;
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b36dbSNP129;
+ private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b37dbSNP132;
// --------------------------------------------------------------------------------------------------------------
//
@@ -28,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
- Arrays.asList("149e6ad9b3fd23551254a691286a96b3"));
+ Arrays.asList("4bd3e874d071c4df250dce32cf441aab"));
executeTest("test MultiSample Pilot1", spec);
}
@@ -276,7 +277,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
Arrays.asList("e66b7321e2ac91742ad3ef91040daafd"));
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
+ WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
+ baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
+ "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
+ Arrays.asList("b6c3f771e8844a64681187ebb2b620f1"));
+ executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4);
+
}
+
}
From 9344938360d6d7d1312dc5993023fedd4f540701 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel
Date: Sat, 10 Sep 2011 19:41:01 -0400
Subject: [PATCH 14/49] Uncomment code to add deleted bases covering an indel
to per-sample genotype reporting, update integration tests accordingly
---
...elGenotypeLikelihoodsCalculationModel.java | 2 +-
.../UnifiedGenotyperIntegrationTest.java | 24 +++++++++----------
2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
index 2a99f1aad..ec5eefd60 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@@ -449,7 +449,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
- if (/*p.isDeletion() ||*/ BaseUtils.isRegularBase(p.getBase()) )
+ if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) )
count++;
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
index e212e07ea..41496bdf1 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@@ -29,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
- Arrays.asList("4bd3e874d071c4df250dce32cf441aab"));
+ Arrays.asList("e6639ea2dc81635c706e6c35921406d7"));
executeTest("test MultiSample Pilot1", spec);
}
@@ -50,7 +50,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
- Arrays.asList("82d469145c174486ccc494884852cc58"));
+ Arrays.asList("d1cbd1fb9f3f7323941a95bc2def7e5a"));
executeTest("test SingleSample Pilot2", spec);
}
@@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
- private final static String COMPRESSED_OUTPUT_MD5 = "a5a9f38c645d6004d4640765a8b77ce4";
+ private final static String COMPRESSED_OUTPUT_MD5 = "2732b169cdccb21eb3ea00429619de79";
@Test
public void testCompressedOutput() {
@@ -81,7 +81,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
- String md5 = "0a45761c0e557d9c2080eb9e7f4f6c41";
+ String md5 = "cbac3960bbcb9d6192c57549208c182c";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@@ -160,8 +160,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testHeterozyosity() {
HashMap e = new HashMap();
- e.put( 0.01, "af5199fbc0853cf5888acdcc88f012bc" );
- e.put( 1.0 / 1850, "4e6938645ccde1fdf204ffbf4e88170f" );
+ e.put( 0.01, "aed69402ddffe7f2ed5ca98563bfba02" );
+ e.put( 1.0 / 1850, "fa94a059f08c1821b721335d93ed2ea5" );
for ( Map.Entry entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@@ -185,7 +185,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
- Arrays.asList("213ebaaaacf850312d885e918eb33500"));
+ Arrays.asList("1c080e6596d4c830bb5d147b04e2a82c"));
executeTest(String.format("test multiple technologies"), spec);
}
@@ -204,7 +204,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
- Arrays.asList("3aecba34a89f3525afa57a38dc20e6cd"));
+ Arrays.asList("9129ad748ca3be2d3b321d2d7e83ae5b"));
executeTest(String.format("test calling with BAQ"), spec);
}
@@ -223,7 +223,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
- Arrays.asList("043973c719a85de29a35a33a674616fb"));
+ Arrays.asList("0bece77ce6bc447438ef9b2921b2dc41"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@@ -238,7 +238,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
- Arrays.asList("68d4e6c1849e892467aed61c33e7bf24"));
+ Arrays.asList("5fe98ee853586dc9db58f0bc97daea63"));
executeTest(String.format("test indel caller in SLX witn low min allele count"), spec);
}
@@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
- Arrays.asList("f86d453c5d2d2f33fb28ae2050658a5e"));
+ Arrays.asList("790b1a1d6ab79eee8c24812bb8ca6fae"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@@ -280,7 +280,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
- Arrays.asList("b6c3f771e8844a64681187ebb2b620f1"));
+ Arrays.asList("4be308fd9e8167ebee677f62a7a753b7"));
executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4);
}
From 07d365ce392bc5c38ad1f3dfc7348819b7017438 Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Mon, 12 Sep 2011 09:01:34 -0400
Subject: [PATCH 17/49] Fixing units in queue job report Gantt plots
---
public/R/queueJobReport.R | 2 ++
1 file changed, 2 insertions(+)
diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R
index a24d269c9..31916361e 100644
--- a/public/R/queueJobReport.R
+++ b/public/R/queueJobReport.R
@@ -140,6 +140,8 @@ print(paste("Project :", inputFileName))
convertUnits <- function(gatkReportData) {
convertGroup <- function(g) {
g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS
+ g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS
+ g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS
g
}
lapply(gatkReportData, convertGroup)
From 60ebe68aff12290f18527faed04f3f7bb356d962 Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Mon, 12 Sep 2011 09:43:23 -0400
Subject: [PATCH 18/49] Fixing issue in VariantEval in which insertion and
deletion events weren't treated symmetrically. Added new option to require
strict allele matching.
---
.../gatk/walkers/varianteval/VariantEvalWalker.java | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index 0d09b7033..266b97af0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -55,7 +55,7 @@ import java.util.*;
*
* Output
*
- * Evaluation tables.
+ * Evaluation tables detailing the results of the eval modules which were applied.
*
*
* Examples
@@ -152,6 +152,9 @@ public class VariantEvalWalker extends RodWalker implements Tr
@Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false)
private File ancestralAlignmentsFile = null;
+ @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
+ private boolean requireStrictAlleleMatch = false;
+
// Variables
private Set jexlExpressions = new TreeSet();
@@ -360,16 +363,16 @@ public class VariantEvalWalker extends RodWalker implements Tr
if ( matchingComps.size() == 0 )
return null;
- // find the comp which matches the alternate allele from eval
+ // find the comp which matches both the reference allele and alternate allele from eval
Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0);
for ( VariantContext comp : matchingComps ) {
Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0);
- if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp)) )
+ if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) )
return comp;
}
- // if none match, just return the first one
- return matchingComps.get(0);
+ // if none match, just return the first one unless we require a strict match
+ return (requireStrictAlleleMatch ? null : matchingComps.get(0));
}
public Integer treeReduce(Integer lhs, Integer rhs) { return null; }
From 981b78ea50708139cc3157ccff990fca6cb3e7e8 Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Mon, 12 Sep 2011 12:17:43 -0400
Subject: [PATCH 19/49] Changing the VQSR command line syntax back to the
parsed tags approach. This cleans up the code and makes sure we won't be
parsing the same rod file multiple times. I've tried to update the
appropriate qscripts.
---
.../variantrecalibration/TrainingSet.java | 76 ++++++++++++++++
.../VariantDataManager.java | 88 +++++++++----------
.../VariantRecalibrator.java | 66 ++++----------
...ntRecalibrationWalkersIntegrationTest.java | 13 ++-
.../MethodsDevelopmentCallingPipeline.scala | 10 +--
5 files changed, 147 insertions(+), 106 deletions(-)
create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java
new file mode 100755
index 000000000..5f688d001
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2011 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.commandline.Tags;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: rpoplin
+ * Date: 3/12/11
+ */
+
+public class TrainingSet {
+
+ public RodBinding rodBinding;
+ public boolean isKnown = false;
+ public boolean isTraining = false;
+ public boolean isAntiTraining = false;
+ public boolean isTruth = false;
+ public boolean isConsensus = false;
+ public double prior = 0.0;
+
+ protected final static Logger logger = Logger.getLogger(TrainingSet.class);
+
+ public TrainingSet( final RodBinding rodBinding) {
+ this.rodBinding = rodBinding;
+
+ final Tags tags = rodBinding.getTags();
+ final String name = rodBinding.getName();
+
+ // Parse the tags to decide which tracks have which properties
+ if( tags != null ) {
+ isKnown = tags.containsKey("known") && tags.getValue("known").equals("true");
+ isTraining = tags.containsKey("training") && tags.getValue("training").equals("true");
+ isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true");
+ isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true");
+ isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true");
+ prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior );
+ }
+
+ // Report back to the user which tracks were found and the properties that were detected
+ if( !isConsensus && !isAntiTraining ) {
+ logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) );
+ } else if( isConsensus ) {
+ logger.info( String.format( "Found consensus track: %s", name) );
+ } else {
+ logger.info( String.format( "Found bad sites training track: %s", name) );
+ }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
index 429becfc7..e04bfab76 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
@@ -51,10 +51,10 @@ public class VariantDataManager {
private ExpandingArrayList data;
private final double[] meanVector;
private final double[] varianceVector; // this is really the standard deviation
- public final ArrayList annotationKeys;
+ public final List annotationKeys;
private final VariantRecalibratorArgumentCollection VRAC;
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
-
+ protected final List trainingSets;
public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) {
this.data = null;
@@ -62,6 +62,7 @@ public class VariantDataManager {
this.VRAC = VRAC;
meanVector = new double[this.annotationKeys.size()];
varianceVector = new double[this.annotationKeys.size()];
+ trainingSets = new ArrayList();
}
public void setData( final ExpandingArrayList data ) {
@@ -104,6 +105,31 @@ public class VariantDataManager {
}
}
+ public void addTrainingSet( final TrainingSet trainingSet ) {
+ trainingSets.add( trainingSet );
+ }
+
+ public boolean checkHasTrainingSet() {
+ for( final TrainingSet trainingSet : trainingSets ) {
+ if( trainingSet.isTraining ) { return true; }
+ }
+ return false;
+ }
+
+ public boolean checkHasTruthSet() {
+ for( final TrainingSet trainingSet : trainingSets ) {
+ if( trainingSet.isTruth ) { return true; }
+ }
+ return false;
+ }
+
+ public boolean checkHasKnownSet() {
+ for( final TrainingSet trainingSet : trainingSets ) {
+ if( trainingSet.isKnown ) { return true; }
+ }
+ return false;
+ }
+
public ExpandingArrayList getTrainingData() {
final ExpandingArrayList trainingData = new ExpandingArrayList();
for( final VariantDatum datum : data ) {
@@ -232,57 +258,35 @@ public class VariantDataManager {
return value;
}
- public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap rodToPriorMap,
- final List> training, final List> truth, final List> known, final List> badSites, final List> resource) {
+ public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) {
datum.isKnown = false;
datum.atTruthSite = false;
datum.atTrainingSite = false;
datum.atAntiTrainingSite = false;
datum.prior = 2.0;
- //BUGBUG: need to clean this up
-
- for( final RodBinding rod : training ) {
- for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
+ for( final TrainingSet trainingSet : trainingSets ) {
+ for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
- datum.atTrainingSite = true;
- datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
+ datum.isKnown = datum.isKnown || trainingSet.isKnown;
+ datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth;
+ datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining;
+ datum.prior = Math.max( datum.prior, trainingSet.prior );
+ datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 );
}
- }
- }
- for( final RodBinding rod : truth ) {
- for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
- if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
- datum.atTruthSite = true;
- datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
- }
- }
- }
- for( final RodBinding rod : known ) {
- for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
- if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
- datum.isKnown = true;
- datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
- }
- }
- }
- for( final RodBinding rod : resource ) {
- for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
- if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
- datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
- }
- }
- }
- for( final RodBinding rod : badSites ) {
- for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( trainVC != null ) {
- datum.atAntiTrainingSite = true;
- datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
+ datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining;
}
}
}
}
+ private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
+ return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
+ ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
+ (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
+ }
+
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
for( final VariantDatum datum : data ) {
RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
@@ -290,10 +294,4 @@ public class VariantDataManager {
(datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
}
}
-
- private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
- return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
- ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
- (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
- }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
index df4faebd1..529d17285 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@@ -77,16 +77,15 @@ import java.util.*;
*
* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
*
- *
Examples
+ * Example
*
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T VariantRecalibrator \
* -R reference/human_g1k_v37.fasta \
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
- * -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
- * -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
- * -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
- * -known:prior=8.0 dbsnp_132.b37.vcf \
+ * -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+ * -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ * -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
* -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
* -recalFile path/to/output.recal \
* -tranchesFile path/to/output.tranches \
@@ -112,34 +111,11 @@ public class VariantRecalibrator extends RodWalker> input;
/**
- * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
- */
- @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
- public List> training;
-
- /**
- * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
- * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
- */
- @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
- public List> truth;
-
- /**
- * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
- * The output metrics are stratified by known status in order to aid in comparisons with other call sets.
- */
- @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
- public List> known = Collections.emptyList();
-
- /**
- * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
- * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
- */
- @Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
- public List> badSites = Collections.emptyList();
-
- /**
- * Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
+ * Any set of VCF files to use as lists of training, truth, or known sites.
+ * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
+ * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
+ * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
+ * Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
*/
@Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
public List> resource = Collections.emptyList();
@@ -205,7 +181,6 @@ public class VariantRecalibrator extends RodWalker ignoreInputFilterSet = new TreeSet();
private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
- private final HashMap rodToPriorMap = new HashMap();
//---------------------------------------------------------------------------------------------------------------
//
@@ -227,18 +202,15 @@ public class VariantRecalibrator extends RodWalker> allInputBindings = new ArrayList>();
- allInputBindings.addAll(truth);
- allInputBindings.addAll(training);
- allInputBindings.addAll(known);
- allInputBindings.addAll(badSites);
- allInputBindings.addAll(resource);
- for( final RodBinding rod : allInputBindings ) {
- try {
- rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
- } catch( NumberFormatException e ) {
- throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
- }
+ for( RodBinding rod : resource ) {
+ dataManager.addTrainingSet( new TrainingSet( rod ) );
+ }
+
+ if( !dataManager.checkHasTrainingSet() ) {
+ throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
+ }
+ if( !dataManager.checkHasTruthSet() ) {
+ throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
}
}
@@ -270,7 +242,7 @@ public class VariantRecalibrator extends RodWalker= 10) {
From 9d9d438bc4b1804631126565ad15f4a69f649b0b Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Mon, 12 Sep 2011 12:28:23 -0400
Subject: [PATCH 20/49] New VariantAnnotatorEngine capability: an initialize()
method for all annotation classes.
All VariantAnnotator annotation classes may now have an (optional) initialize() method
that gets called by the VariantAnnotatorEngine ONCE before annotation starts.
As an example of how this can be used, the SnpEff annotation class will use the initialize()
method to check whether the SnpEff version number stored in the vcf header is a supported
version, and also to verify that its required RodBinding is present.
---
.../gatk/walkers/annotator/VariantAnnotator.java | 3 +++
.../walkers/annotator/VariantAnnotatorEngine.java | 15 +++++++++++----
.../interfaces/AnnotatorCompatibleWalker.java | 1 +
.../interfaces/VariantAnnotatorAnnotation.java | 9 +++------
.../gatk/walkers/genotyper/UnifiedGenotyper.java | 1 +
5 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index 96a400c68..971727727 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -86,6 +86,7 @@ public class VariantAnnotator extends RodWalker implements Ann
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
+ public RodBinding getVariantRodBinding() { return variantCollection.variants; }
/**
* The INFO field will be annotated with information on the most biologically-significant effect
@@ -208,6 +209,8 @@ public class VariantAnnotator extends RodWalker implements Ann
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
engine.initializeExpressions(expressionsToUse);
+ engine.invokeAnnotationInitializationMethods();
+
// setup the header fields
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
Set hInfo = new HashSet();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
index 01926a7f3..17830f129 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@@ -29,10 +29,7 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
@@ -113,6 +110,16 @@ public class VariantAnnotatorEngine {
dbAnnotations.put(rod, rod.getName());
}
+ public void invokeAnnotationInitializationMethods() {
+ for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
+ annotation.initialize(walker);
+ }
+
+ for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
+ annotation.initialize(walker);
+ }
+ }
+
public Set getVCFAnnotationDescriptions() {
Set descriptions = new HashSet();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
index 20a2aea0e..9dda57ae3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
@@ -9,6 +9,7 @@ import java.util.List;
public interface AnnotatorCompatibleWalker {
// getter methods for various used bindings
+ public abstract RodBinding getVariantRodBinding();
public abstract RodBinding getSnpEffRodBinding();
public abstract RodBinding getDbsnpRodBinding();
public abstract List> getCompRodBindings();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
index f33d61df9..9e48de9c3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
@@ -24,18 +24,15 @@
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
-import java.util.Map;
@DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations")
public abstract class VariantAnnotatorAnnotation {
// return the INFO keys
public abstract List getKeyNames();
+
+ // initialization method (optional for subclasses, and therefore non-abstract)
+ public void initialize ( AnnotatorCompatibleWalker walker ) { }
}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index d5dbdedd6..4ee2d5f44 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -127,6 +127,7 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; }
+ public RodBinding getVariantRodBinding() { return null; }
public RodBinding getSnpEffRodBinding() { return null; }
public List> getCompRodBindings() { return Collections.emptyList(); }
public List> getResourceRodBindings() { return Collections.emptyList(); }
From ec4b30de6d8e9634ca0014c65215460bda066b64 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Mon, 12 Sep 2011 14:45:53 -0400
Subject: [PATCH 21/49] Patch from Laurent: typo leads to bad error messages.
---
.../sting/commandline/ArgumentTypeDescriptor.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index 16358d05f..5fff8f609 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -379,7 +379,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
if ( tribbleType == null )
- if ( ! file.canRead() | !! file.isFile() ) {
+ if ( ! file.canRead() | ! file.isFile() ) {
throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
} else {
throw new UserException.CommandLineException(
From 4e116760f4f8577bfaf677ed7711cfc599a6c90d Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Mon, 12 Sep 2011 15:09:25 -0400
Subject: [PATCH 22/49] Removing some old cruft from the packages dir.
Updating AnalyzeCovariates to include all Covariates.
---
public/packages/AnalyzeCovariates.xml | 5 +---
.../packages/FindContaminatingReadGroups.xml | 10 -------
public/packages/GATKResources.xml | 20 --------------
public/packages/IndelGenotyper.xml | 11 --------
.../packages/LocalRealignmentAroundIndels.xml | 12 ---------
.../packages/QualityScoresRecalibration.xml | 18 -------------
public/packages/RMDIndexer.xml | 13 ----------
public/packages/UnifiedGenotyper.xml | 11 --------
public/packages/VariantAnnotator.xml | 26 -------------------
public/packages/VariantEval.xml | 18 -------------
public/packages/VariantFiltration.xml | 13 ----------
public/packages/VariantRecalibration.xml | 12 ---------
12 files changed, 1 insertion(+), 168 deletions(-)
delete mode 100644 public/packages/FindContaminatingReadGroups.xml
delete mode 100755 public/packages/GATKResources.xml
delete mode 100644 public/packages/IndelGenotyper.xml
delete mode 100644 public/packages/LocalRealignmentAroundIndels.xml
delete mode 100644 public/packages/QualityScoresRecalibration.xml
delete mode 100644 public/packages/RMDIndexer.xml
delete mode 100644 public/packages/UnifiedGenotyper.xml
delete mode 100644 public/packages/VariantAnnotator.xml
delete mode 100644 public/packages/VariantEval.xml
delete mode 100644 public/packages/VariantFiltration.xml
delete mode 100644 public/packages/VariantRecalibration.xml
diff --git a/public/packages/AnalyzeCovariates.xml b/public/packages/AnalyzeCovariates.xml
index 7e31934df..a6675a63d 100644
--- a/public/packages/AnalyzeCovariates.xml
+++ b/public/packages/AnalyzeCovariates.xml
@@ -6,10 +6,7 @@
-
-
-
-
+
diff --git a/public/packages/FindContaminatingReadGroups.xml b/public/packages/FindContaminatingReadGroups.xml
deleted file mode 100644
index 880f64a81..000000000
--- a/public/packages/FindContaminatingReadGroups.xml
+++ /dev/null
@@ -1,10 +0,0 @@
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/GATKResources.xml b/public/packages/GATKResources.xml
deleted file mode 100755
index 87e6e0e50..000000000
--- a/public/packages/GATKResources.xml
+++ /dev/null
@@ -1,20 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/IndelGenotyper.xml b/public/packages/IndelGenotyper.xml
deleted file mode 100644
index c9e3ae0f6..000000000
--- a/public/packages/IndelGenotyper.xml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/LocalRealignmentAroundIndels.xml b/public/packages/LocalRealignmentAroundIndels.xml
deleted file mode 100644
index 46960e69f..000000000
--- a/public/packages/LocalRealignmentAroundIndels.xml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/QualityScoresRecalibration.xml b/public/packages/QualityScoresRecalibration.xml
deleted file mode 100644
index 95e8b7c63..000000000
--- a/public/packages/QualityScoresRecalibration.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/RMDIndexer.xml b/public/packages/RMDIndexer.xml
deleted file mode 100644
index 5d40876de..000000000
--- a/public/packages/RMDIndexer.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/UnifiedGenotyper.xml b/public/packages/UnifiedGenotyper.xml
deleted file mode 100644
index 67a17640c..000000000
--- a/public/packages/UnifiedGenotyper.xml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/VariantAnnotator.xml b/public/packages/VariantAnnotator.xml
deleted file mode 100644
index 88c0701f0..000000000
--- a/public/packages/VariantAnnotator.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/VariantEval.xml b/public/packages/VariantEval.xml
deleted file mode 100644
index 791066fb7..000000000
--- a/public/packages/VariantEval.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/VariantFiltration.xml b/public/packages/VariantFiltration.xml
deleted file mode 100644
index 48fa0ff37..000000000
--- a/public/packages/VariantFiltration.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/public/packages/VariantRecalibration.xml b/public/packages/VariantRecalibration.xml
deleted file mode 100644
index 6fe6b1eff..000000000
--- a/public/packages/VariantRecalibration.xml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
From e63d9d8f8edeaab24ad3bff06a97cc15b14b0043 Mon Sep 17 00:00:00 2001
From: Matt Hanna
Date: Mon, 12 Sep 2011 21:50:59 -0400
Subject: [PATCH 23/49] Mauricio pointed out to me that dynamically merging the
 unmapped regions of multiple BAMs ('-L unmapped' with a BAM list) was
 completely broken. Sorry about this! Fixed.
---
.../gatk/datasources/reads/BAMScheduler.java | 3 +-
.../gatk/datasources/reads/GATKBAMIndex.java | 39 +++++++++++++++++++
.../datasources/reads/ReadShardStrategy.java | 21 ++--------
3 files changed, 45 insertions(+), 18 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
index 467aebac5..47eb55b28 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
@@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
+import net.sf.samtools.GATKChunk;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@@ -84,7 +85,7 @@ public class BAMScheduler implements Iterator {
if(currentLocus == GenomeLoc.UNMAPPED) {
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
- nextFilePointer.addFileSpans(id,new GATKBAMFileSpan());
+ nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
currentLocus = null;
continue;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
index 5d0c38b78..dc703ff23 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
@@ -215,6 +215,45 @@ public class GATKBAMIndex {
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
}
+ /**
+ * Use to get close to the unmapped reads at the end of a BAM file.
+ * @return The file offset of the first record in the last linear bin, or -1
+ * if there are no elements in linear bins (i.e. no mapped reads).
+ */
+ public long getStartOfLastLinearBin() {
+ openIndexFile();
+
+ seek(4);
+
+ final int sequenceCount = readInteger();
+ // Because no reads may align to the last sequence in the sequence dictionary,
+ // grab the last element of the linear index for each sequence, and return
+ // the last one from the last sequence that has one.
+ long lastLinearIndexPointer = -1;
+ for (int i = 0; i < sequenceCount; i++) {
+ // System.out.println("# Sequence TID: " + i);
+ final int nBins = readInteger();
+ // System.out.println("# nBins: " + nBins);
+ for (int j1 = 0; j1 < nBins; j1++) {
+ // Skip bin #
+ skipBytes(4);
+ final int nChunks = readInteger();
+ // Skip chunks
+ skipBytes(16 * nChunks);
+ }
+ final int nLinearBins = readInteger();
+ if (nLinearBins > 0) {
+ // Skip to last element of list of linear bins
+ skipBytes(8 * (nLinearBins - 1));
+ lastLinearIndexPointer = readLongs(1)[0];
+ }
+ }
+
+ closeIndexFile();
+
+ return lastLinearIndexPointer;
+ }
+
/**
* Gets the possible number of bins for a given reference sequence.
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
index c2235ec73..5ea75dbb0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
@@ -134,24 +134,11 @@ public class ReadShardStrategy implements ShardStrategy {
Map selectedReaders = new HashMap();
while(selectedReaders.size() == 0 && currentFilePointer != null) {
shardPosition = currentFilePointer.fileSpans;
+
for(SAMReaderID id: shardPosition.keySet()) {
- // If the region contains location information (in other words, it is not at
- // the start of the unmapped region), add the region.
- if(currentFilePointer.isRegionUnmapped) {
- // If the region is unmapped and no location data exists, add a null as an indicator to
- // start at the next unmapped region.
- if(!isIntoUnmappedRegion) {
- selectedReaders.put(id,null);
- isIntoUnmappedRegion = true;
- }
- else
- selectedReaders.put(id,position.get(id));
- }
- else {
- SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
- if(!fileSpan.isEmpty())
- selectedReaders.put(id,fileSpan);
- }
+ SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
+ if(!fileSpan.isEmpty())
+ selectedReaders.put(id,fileSpan);
}
if(selectedReaders.size() > 0) {
From c6672f2397fd55de79caa05420fc0ff8201d0e4d Mon Sep 17 00:00:00 2001
From: Guillermo del Angel
Date: Tue, 13 Sep 2011 16:57:37 -0400
Subject: [PATCH 24/49] Intermediate (but necessary) fix for Beagle walkers: if
a marker is absent in the Beagle output files, but present in the input vcf,
there's no reason why it should be omitted in the output vcf. Rather, the vc
is written as is from the input vcf
---
.../beagle/BeagleOutputToVCFWalker.java | 19 +++++++------------
.../walkers/beagle/BeagleIntegrationTest.java | 4 ++--
2 files changed, 9 insertions(+), 14 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
index 880dba5d0..7f6dabeec 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
@@ -175,21 +175,16 @@ public class BeagleOutputToVCFWalker extends RodWalker {
}
BeagleFeature beagleR2Feature = tracker.getFirstValue(beagleR2);
- // ignore places where we don't have a variant
- if ( beagleR2Feature == null )
- return 0;
-
-
BeagleFeature beagleProbsFeature = tracker.getFirstValue(beagleProbs);
-
- // ignore places where we don't have a variant
- if ( beagleProbsFeature == null )
- return 0;
-
BeagleFeature beaglePhasedFeature = tracker.getFirstValue(beaglePhased);
+
// ignore places where we don't have a variant
- if ( beaglePhasedFeature == null )
- return 0;
+ if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null)
+ {
+ vcfWriter.add(vc_input);
+ return 1;
+ }
+
// get reference base for current position
byte refByte = ref.getBase();
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java
index 5f759fdbf..1a01ef8e8 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java
@@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " +
"--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " +
"--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " +
- "-o %s -NO_HEADER", 1, Arrays.asList("3531451e84208264104040993889aaf4"));
+ "-o %s -NO_HEADER", 1, Arrays.asList("b445d280fd8fee1eeb4aacb3f5a54847"));
executeTest("test BeagleOutputToVCF", spec);
}
@@ -72,7 +72,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
"--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
"--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
- "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("8dd6ec53994fb46c5c22af8535d22965"));
+ "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("51a57ea565176edd96d907906914b0ee"));
executeTest("testBeagleChangesSitesToRef",spec);
}
From 1213b2f8c67bb871c418f13b16728a702c877786 Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Fri, 9 Sep 2011 16:10:30 -0400
Subject: [PATCH 25/49] SnpEff 2.0.2 support
-Rewrote SnpEff support in VariantAnnotator to support the latest SnpEff release (version 2.0.2)
-Removed support for SnpEff 1.9.6 (and associated tribble codec)
-Will refuse to parse SnpEff output files produced by unsupported versions (or without a version tag)
-Correctly matches ref/alt alleles before annotating a record, unlike the previous version
-Correctly handles indels (again, unlike the previous version)
---
.../sting/gatk/walkers/annotator/SnpEff.java | 482 ++++++++++++++----
.../walkers/annotator/VariantAnnotator.java | 9 +-
.../annotator/VariantAnnotatorEngine.java | 12 +-
.../interfaces/AnnotatorCompatibleWalker.java | 5 +-
.../VariantAnnotatorAnnotation.java | 3 +-
.../walkers/genotyper/UnifiedGenotyper.java | 5 +-
.../utils/codecs/snpEff/SnpEffCodec.java | 282 ----------
.../utils/codecs/snpEff/SnpEffConstants.java | 115 -----
.../utils/codecs/snpEff/SnpEffFeature.java | 423 ---------------
.../sting/utils/codecs/vcf/VCFHeader.java | 12 +
.../utils/variantcontext/VariantContext.java | 22 +
.../walkers/annotator/SnpEffUnitTest.java | 86 ++++
.../VariantAnnotatorIntegrationTest.java | 21 +-
.../codecs/snpEff/SnpEffCodecUnitTest.java | 259 ----------
14 files changed, 539 insertions(+), 1197 deletions(-)
delete mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
delete mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java
delete mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java
create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
delete mode 100644 public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
index 350c683c2..14abbca5b 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
@@ -24,7 +24,9 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
+import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -32,10 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@@ -46,134 +45,421 @@ import java.util.*;
* (http://snpeff.sourceforge.net/).
*
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
- * output file (which must be provided on the command line via --snpEffFile:SnpEff ),
+ * output file (which must be provided on the command line via --snpEffFile .vcf),
* and adds annotations on that effect.
*
- * The possible biological effects and their associated impacts are defined in the class:
- * org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants
- *
* @author David Roazen
*/
public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation {
- // SnpEff annotation key names:
- public static final String GENE_ID_KEY = "GENE_ID";
- public static final String GENE_NAME_KEY = "GENE_NAME";
- public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID";
- public static final String EXON_ID_KEY = "EXON_ID";
- public static final String EXON_RANK_KEY = "EXON_RANK";
- public static final String WITHIN_NON_CODING_GENE_KEY = "WITHIN_NON_CODING_GENE";
- public static final String EFFECT_KEY = "EFFECT";
- public static final String EFFECT_IMPACT_KEY = "EFFECT_IMPACT";
- public static final String EFFECT_EXTRA_INFORMATION_KEY = "EFFECT_EXTRA_INFORMATION";
- public static final String OLD_NEW_AA_KEY = "OLD_NEW_AA";
- public static final String OLD_NEW_CODON_KEY = "OLD_NEW_CODON";
- public static final String CODON_NUM_KEY = "CODON_NUM";
- public static final String CDS_SIZE_KEY = "CDS_SIZE";
+ private static Logger logger = Logger.getLogger(SnpEff.class);
+
+ // We refuse to parse SnpEff output files generated by unsupported versions, or
+ // lacking a SnpEff version number in the VCF header:
+ public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
+ public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
+
+ // SnpEff aggregates all effects (and effect metadata) together into a single INFO
+ // field annotation with the key EFF:
+ public static final String SNPEFF_INFO_FIELD_KEY = "EFF";
+ public static final String SNPEFF_EFFECT_METADATA_DELIMITER = "[()]";
+ public static final String SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER = "\\|";
+
+ // Key names for the INFO field annotations we will add to each record, along
+ // with parsing-related information:
+ public enum InfoFieldKey {
+ EFF (-1),
+ EFF_IMPACT (0),
+ EFF_CODON_CHANGE (1),
+ EFF_AMINO_ACID_CHANGE (2),
+ EFF_GENE_NAME (3),
+ EFF_GENE_BIOTYPE (4),
+ EFF_TRANSCRIPT_ID (6),
+ EFF_EXON_ID (7);
+
+ // Index within the effect metadata subfields from the SnpEff EFF annotation
+ // where each key's associated value can be found during parsing.
+ private final int fieldIndex;
+
+ InfoFieldKey ( int fieldIndex ) {
+ this.fieldIndex = fieldIndex;
+ }
+
+ public int getFieldIndex() {
+ return fieldIndex;
+ }
+ }
+
+ // Possible SnpEff biological effects. All effect names found in the SnpEff input file
+ // are validated against this list.
+ public enum EffectType {
+ NONE,
+ CHROMOSOME,
+ INTERGENIC,
+ UPSTREAM,
+ UTR_5_PRIME,
+ UTR_5_DELETED,
+ START_GAINED,
+ SPLICE_SITE_ACCEPTOR,
+ SPLICE_SITE_DONOR,
+ START_LOST,
+ SYNONYMOUS_START,
+ NON_SYNONYMOUS_START,
+ CDS,
+ GENE,
+ TRANSCRIPT,
+ EXON,
+ EXON_DELETED,
+ NON_SYNONYMOUS_CODING,
+ SYNONYMOUS_CODING,
+ FRAME_SHIFT,
+ CODON_CHANGE,
+ CODON_INSERTION,
+ CODON_CHANGE_PLUS_CODON_INSERTION,
+ CODON_DELETION,
+ CODON_CHANGE_PLUS_CODON_DELETION,
+ STOP_GAINED,
+ SYNONYMOUS_STOP,
+ NON_SYNONYMOUS_STOP,
+ STOP_LOST,
+ INTRON,
+ UTR_3_PRIME,
+ UTR_3_DELETED,
+ DOWNSTREAM,
+ INTRON_CONSERVED,
+ INTERGENIC_CONSERVED,
+ REGULATION,
+ CUSTOM,
+ WITHIN_NON_CODING_GENE
+ }
+
+ // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact.
+ public enum EffectImpact {
+ LOW (1),
+ MODERATE (2),
+ HIGH (3);
+
+ private final int severityRating;
+
+ EffectImpact ( int severityRating ) {
+ this.severityRating = severityRating;
+ }
+
+ public boolean isHigherImpactThan ( EffectImpact other ) {
+ return this.severityRating > other.severityRating;
+ }
+ }
+
+ // SnpEff labels most effects as either CODING or NON_CODING, but sometimes omits this information.
+ public enum EffectCoding {
+ CODING,
+ NON_CODING,
+ UNKNOWN
+ }
+
+
+ public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) {
+ validateRodBinding(walker.getSnpEffRodBinding());
+ checkSnpEffVersion(walker, toolkit);
+ }
public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) {
- RodBinding snpEffRodBinding = walker.getSnpEffRodBinding();
- validateRodBinding(snpEffRodBinding);
+ RodBinding snpEffRodBinding = walker.getSnpEffRodBinding();
- List features = tracker.getValues(snpEffRodBinding, ref.getLocus());
+ // Get only SnpEff records that start at this locus, not merely span it:
+ List snpEffRecords = tracker.getValues(snpEffRodBinding, ref.getLocus());
- // Add only annotations for one of the most biologically-significant effects as defined in
- // the SnpEffConstants class:
- SnpEffFeature mostSignificantEffect = getMostSignificantEffect(features);
-
- if ( mostSignificantEffect == null ) {
+ // Within this set, look for a SnpEff record whose ref/alt alleles match the record to annotate.
+ // If there is more than one such record, we only need to pick the first one, since the biological
+ // effects will be the same across all such records:
+ VariantContext matchingRecord = getMatchingSnpEffRecord(snpEffRecords, vc);
+ if ( matchingRecord == null ) {
return null;
}
- return generateAnnotations(mostSignificantEffect);
+ // Parse the SnpEff INFO field annotation from the matching record into individual effect objects:
+ List effects = parseSnpEffRecord(matchingRecord);
+ if ( effects.size() == 0 ) {
+ return null;
+ }
+
+ // Add only annotations for one of the most biologically-significant effects from this set:
+ SnpEffEffect mostSignificantEffect = getMostSignificantEffect(effects);
+ return mostSignificantEffect.getAnnotations();
}
- private void validateRodBinding ( RodBinding snpEffRodBinding ) {
+ private void validateRodBinding ( RodBinding snpEffRodBinding ) {
if ( snpEffRodBinding == null || ! snpEffRodBinding.isBound() ) {
- throw new UserException("The SnpEff annotator requires that a SnpEff output file be provided " +
- "as a rodbinding on the command line, but no SnpEff rodbinding was found.");
+ throw new UserException("The SnpEff annotator requires that a SnpEff VCF output file be provided " +
+ "as a rodbinding on the command line via the --snpEffFile option, but " +
+ "no SnpEff rodbinding was found.");
}
}
- private SnpEffFeature getMostSignificantEffect ( List snpEffFeatures ) {
- SnpEffFeature mostSignificantEffect = null;
+ private void checkSnpEffVersion ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) {
+ RodBinding snpEffRodBinding = walker.getSnpEffRodBinding();
- for ( SnpEffFeature snpEffFeature : snpEffFeatures ) {
+ VCFHeader snpEffVCFHeader = VCFUtils.getVCFHeadersFromRods(toolkit, Arrays.asList(snpEffRodBinding.getName())).get(snpEffRodBinding.getName());
+ VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY);
+
+ if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) {
+ throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " +
+ "input file, and so could not verify that the file was generated by a supported version of SnpEff (" +
+ Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")");
+ }
+
+ String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0];
+
+ if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) {
+ throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " +
+ "is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS));
+ }
+ }
+
+ private boolean isSupportedSnpEffVersion ( String versionString ) {
+ for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) {
+ if ( supportedVersion.equals(versionString) ) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ private VariantContext getMatchingSnpEffRecord ( List snpEffRecords, VariantContext vc ) {
+ for ( VariantContext snpEffRecord : snpEffRecords ) {
+ if ( snpEffRecord.hasSameAlternateAllelesAs(vc) && snpEffRecord.getReference().equals(vc.getReference()) ) {
+ return snpEffRecord;
+ }
+ }
+
+ return null;
+ }
+
+ private List parseSnpEffRecord ( VariantContext snpEffRecord ) {
+ List parsedEffects = new ArrayList();
+
+ Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY);
+ List individualEffects;
+
+ // The VCF codec stores multi-valued fields as a List, and single-valued fields as a String.
+ // We can have either in the case of SnpEff, since there may be one or more than one effect in this record.
+ if ( effectFieldValue instanceof List ) {
+ individualEffects = (List)effectFieldValue;
+ }
+ else {
+ individualEffects = Arrays.asList((String)effectFieldValue);
+ }
+
+ for ( String effectString : individualEffects ) {
+ String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER);
+
+ if ( effectNameAndMetadata.length != 2 ) {
+ logger.warn(String.format("Malformed SnpEff effect field at %s:%d, skipping: %s",
+ snpEffRecord.getChr(), snpEffRecord.getStart(), effectString));
+ continue;
+ }
+
+ String effectName = effectNameAndMetadata[0];
+ String[] effectMetadata = effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1);
+
+ SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata);
+
+ if ( parsedEffect.isWellFormed() ) {
+ parsedEffects.add(parsedEffect);
+ }
+ else {
+ logger.warn(String.format("Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"",
+ snpEffRecord.getChr(), snpEffRecord.getStart(), parsedEffect.getParseError(), effectString));
+ }
+ }
+
+ return parsedEffects;
+ }
+
+ private SnpEffEffect getMostSignificantEffect ( List effects ) {
+ SnpEffEffect mostSignificantEffect = null;
+
+ for ( SnpEffEffect effect : effects ) {
if ( mostSignificantEffect == null ||
- snpEffFeature.isHigherImpactThan(mostSignificantEffect) ) {
+ effect.isHigherImpactThan(mostSignificantEffect) ) {
- mostSignificantEffect = snpEffFeature;
+ mostSignificantEffect = effect;
}
}
return mostSignificantEffect;
}
- private Map generateAnnotations ( SnpEffFeature mostSignificantEffect ) {
- Map annotations = new LinkedHashMap(Utils.optimumHashSize(getKeyNames().size()));
-
- if ( mostSignificantEffect.hasGeneID() )
- annotations.put(GENE_ID_KEY, mostSignificantEffect.getGeneID());
- if ( mostSignificantEffect.hasGeneName() )
- annotations.put(GENE_NAME_KEY, mostSignificantEffect.getGeneName());
- if ( mostSignificantEffect.hasTranscriptID() )
- annotations.put(TRANSCRIPT_ID_KEY, mostSignificantEffect.getTranscriptID());
- if ( mostSignificantEffect.hasExonID() )
- annotations.put(EXON_ID_KEY, mostSignificantEffect.getExonID());
- if ( mostSignificantEffect.hasExonRank() )
- annotations.put(EXON_RANK_KEY, Integer.toString(mostSignificantEffect.getExonRank()));
- if ( mostSignificantEffect.isNonCodingGene() )
- annotations.put(WITHIN_NON_CODING_GENE_KEY, null);
-
- annotations.put(EFFECT_KEY, mostSignificantEffect.getEffect().toString());
- annotations.put(EFFECT_IMPACT_KEY, mostSignificantEffect.getEffectImpact().toString());
- if ( mostSignificantEffect.hasEffectExtraInformation() )
- annotations.put(EFFECT_EXTRA_INFORMATION_KEY, mostSignificantEffect.getEffectExtraInformation());
-
- if ( mostSignificantEffect.hasOldAndNewAA() )
- annotations.put(OLD_NEW_AA_KEY, mostSignificantEffect.getOldAndNewAA());
- if ( mostSignificantEffect.hasOldAndNewCodon() )
- annotations.put(OLD_NEW_CODON_KEY, mostSignificantEffect.getOldAndNewCodon());
- if ( mostSignificantEffect.hasCodonNum() )
- annotations.put(CODON_NUM_KEY, Integer.toString(mostSignificantEffect.getCodonNum()));
- if ( mostSignificantEffect.hasCdsSize() )
- annotations.put(CDS_SIZE_KEY, Integer.toString(mostSignificantEffect.getCdsSize()));
-
- return annotations;
- }
-
public List getKeyNames() {
- return Arrays.asList( GENE_ID_KEY,
- GENE_NAME_KEY,
- TRANSCRIPT_ID_KEY,
- EXON_ID_KEY,
- EXON_RANK_KEY,
- WITHIN_NON_CODING_GENE_KEY,
- EFFECT_KEY,
- EFFECT_IMPACT_KEY,
- EFFECT_EXTRA_INFORMATION_KEY,
- OLD_NEW_AA_KEY,
- OLD_NEW_CODON_KEY,
- CODON_NUM_KEY,
- CDS_SIZE_KEY
+ return Arrays.asList( InfoFieldKey.EFF.toString(),
+ InfoFieldKey.EFF_IMPACT.toString(),
+ InfoFieldKey.EFF_CODON_CHANGE.toString(),
+ InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(),
+ InfoFieldKey.EFF_GENE_NAME.toString(),
+ InfoFieldKey.EFF_GENE_BIOTYPE.toString(),
+ InfoFieldKey.EFF_TRANSCRIPT_ID.toString(),
+ InfoFieldKey.EFF_EXON_ID.toString()
);
}
public List getDescriptions() {
return Arrays.asList(
- new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"),
- new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
- new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
- new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant")
+ new VCFInfoHeaderLine(InfoFieldKey.EFF.toString(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
+ new VCFInfoHeaderLine(InfoFieldKey.EFF_IMPACT.toString(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
+ new VCFInfoHeaderLine(InfoFieldKey.EFF_CODON_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_NAME.toString(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_BIOTYPE.toString(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.EFF_EXON_ID.toString(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
);
}
+
+ /**
+ * Helper class to parse, validate, and store a single SnpEff effect and its metadata.
+ */
+ protected static class SnpEffEffect {
+ private EffectType effect;
+ private EffectImpact impact;
+ private String codonChange;
+ private String aminoAcidChange;
+ private String geneName;
+ private String geneBiotype;
+ private EffectCoding coding;
+ private String transcriptID;
+ private String exonID;
+
+ private String parseError = null;
+ private boolean isWellFormed = true;
+
+ private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
+ private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
+ private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;
+
+ // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
+ // errors come after warnings, not vice versa:
+ private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
+ private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;
+
+ private static final int SNPEFF_CODING_FIELD_INDEX = 5;
+
+ public SnpEffEffect ( String effectName, String[] effectMetadata ) {
+ parseEffectName(effectName);
+ parseEffectMetadata(effectMetadata);
+ }
+
+ private void parseEffectName ( String effectName ) {
+ try {
+ effect = EffectType.valueOf(effectName);
+ }
+ catch ( IllegalArgumentException e ) {
+ parseError(String.format("%s is not a recognized effect type", effectName));
+ }
+ }
+
+ private void parseEffectMetadata ( String[] effectMetadata ) {
+ if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
+ if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
+ parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
+ }
+ else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
+ parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
+ }
+ else {
+ parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
+ EXPECTED_NUMBER_OF_METADATA_FIELDS, effectMetadata.length));
+ }
+
+ return;
+ }
+
+ try {
+ impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]);
+ }
+ catch ( IllegalArgumentException e ) {
+ parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]));
+ }
+
+ codonChange = effectMetadata[InfoFieldKey.EFF_CODON_CHANGE.getFieldIndex()];
+ aminoAcidChange = effectMetadata[InfoFieldKey.EFF_AMINO_ACID_CHANGE.getFieldIndex()];
+ geneName = effectMetadata[InfoFieldKey.EFF_GENE_NAME.getFieldIndex()];
+ geneBiotype = effectMetadata[InfoFieldKey.EFF_GENE_BIOTYPE.getFieldIndex()];
+
+ if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
+ try {
+ coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
+ }
+ catch ( IllegalArgumentException e ) {
+ parseError(String.format("Unrecognized value for effect coding: %s", effectMetadata[SNPEFF_CODING_FIELD_INDEX]));
+ }
+ }
+ else {
+ coding = EffectCoding.UNKNOWN;
+ }
+
+ transcriptID = effectMetadata[InfoFieldKey.EFF_TRANSCRIPT_ID.getFieldIndex()];
+ exonID = effectMetadata[InfoFieldKey.EFF_EXON_ID.getFieldIndex()];
+ }
+
+ private void parseError ( String message ) {
+ isWellFormed = false;
+
+ // Cache only the first error encountered:
+ if ( parseError == null ) {
+ parseError = message;
+ }
+ }
+
+ public boolean isWellFormed() {
+ return isWellFormed;
+ }
+
+ public String getParseError() {
+ return parseError == null ? "" : parseError;
+ }
+
+ public boolean isCoding() {
+ return coding == EffectCoding.CODING;
+ }
+
+ public boolean isHigherImpactThan ( SnpEffEffect other ) {
+ // If one effect is within a coding gene and the other is not, the effect that is
+ // within the coding gene has higher impact:
+
+ if ( isCoding() && ! other.isCoding() ) {
+ return true;
+ }
+ else if ( ! isCoding() && other.isCoding() ) {
+ return false;
+ }
+
+ // Otherwise, both effects are either in or not in a coding gene, so we compare the impacts
+ // of the effects themselves:
+
+ return impact.isHigherImpactThan(other.impact);
+ }
+
+ public Map getAnnotations() {
+ Map annotations = new LinkedHashMap(Utils.optimumHashSize(InfoFieldKey.values().length));
+
+ addAnnotation(annotations, InfoFieldKey.EFF.toString(), effect.toString());
+ addAnnotation(annotations, InfoFieldKey.EFF_IMPACT.toString(), impact.toString());
+ addAnnotation(annotations, InfoFieldKey.EFF_CODON_CHANGE.toString(), codonChange);
+ addAnnotation(annotations, InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), aminoAcidChange);
+ addAnnotation(annotations, InfoFieldKey.EFF_GENE_NAME.toString(), geneName);
+ addAnnotation(annotations, InfoFieldKey.EFF_GENE_BIOTYPE.toString(), geneBiotype);
+ addAnnotation(annotations, InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), transcriptID);
+ addAnnotation(annotations, InfoFieldKey.EFF_EXON_ID.toString(), exonID);
+
+ return annotations;
+ }
+
+ private void addAnnotation ( Map annotations, String keyName, String keyValue ) {
+ // Only add annotations for keys associated with non-empty values:
+ if ( keyValue != null && keyValue.trim().length() > 0 ) {
+ annotations.put(keyName, keyValue);
+ }
+ }
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index 971727727..fb3dbc3cf 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -40,7 +40,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@@ -93,8 +92,8 @@ public class VariantAnnotator extends RodWalker implements Ann
* listed in the SnpEff output file for each variant.
*/
@Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false)
- public RodBinding<SnpEffFeature> snpEffFile;
- public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return snpEffFile; }
+ public RodBinding<VariantContext> snpEffFile;
+ public RodBinding<VariantContext> getSnpEffRodBinding() { return snpEffFile; }
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
@@ -204,9 +203,9 @@ public class VariantAnnotator extends RodWalker implements Ann
}
if ( USE_ALL_ANNOTATIONS )
- engine = new VariantAnnotatorEngine(this);
+ engine = new VariantAnnotatorEngine(this, getToolkit());
else
- engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
+ engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit());
engine.initializeExpressions(expressionsToUse);
engine.invokeAnnotationInitializationMethods();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
index 17830f129..68cd07803 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@@ -26,6 +26,7 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -46,6 +47,7 @@ public class VariantAnnotatorEngine {
private HashMap<RodBinding<VariantContext>, String> dbAnnotations = new HashMap<RodBinding<VariantContext>, String>();
private AnnotatorCompatibleWalker walker;
+ private GenomeAnalysisEngine toolkit;
private static class VAExpression {
@@ -71,16 +73,18 @@ public class VariantAnnotatorEngine {
}
// use this constructor if you want all possible annotations
- public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker) {
+ public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
this.walker = walker;
+ this.toolkit = toolkit;
requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations();
requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations();
initializeDBs();
}
// use this constructor if you want to select specific annotations (and/or interfaces)
- public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker) {
+ public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
this.walker = walker;
+ this.toolkit = toolkit;
initializeAnnotations(annotationGroupsToUse, annotationsToUse);
initializeDBs();
}
@@ -112,11 +116,11 @@ public class VariantAnnotatorEngine {
public void invokeAnnotationInitializationMethods() {
for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
- annotation.initialize(walker);
+ annotation.initialize(walker, toolkit);
}
for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
- annotation.initialize(walker);
+ annotation.initialize(walker, toolkit);
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
index 9dda57ae3..7200f841b 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
@@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
import org.broadinstitute.sting.commandline.RodBinding;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
@@ -10,8 +9,8 @@ public interface AnnotatorCompatibleWalker {
// getter methods for various used bindings
public abstract RodBinding getVariantRodBinding();
- public abstract RodBinding<SnpEffFeature> getSnpEffRodBinding();
+ public abstract RodBinding<VariantContext> getSnpEffRodBinding();
public abstract RodBinding getDbsnpRodBinding();
public abstract List<RodBinding<VariantContext>> getCompRodBindings();
public abstract List<RodBinding<VariantContext>> getResourceRodBindings();
-}
\ No newline at end of file
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
index 9e48de9c3..160a3d258 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
@@ -24,6 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import java.util.List;
@@ -34,5 +35,5 @@ public abstract class VariantAnnotatorAnnotation {
public abstract List getKeyNames();
// initialization method (optional for subclasses, and therefore non-abstract)
- public void initialize ( AnnotatorCompatibleWalker walker ) { }
+ public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { }
}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index 4ee2d5f44..428f97e2a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@@ -128,7 +127,7 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; }
public RodBinding getVariantRodBinding() { return null; }
- public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return null; }
+ public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
@@ -211,7 +210,7 @@ public class UnifiedGenotyper extends LocusWalker
- * This format has 23 tab-delimited fields:
- *
- *
- * Chromosome
- * Position
- * Reference
- * Change
- * Change Type: {SNP, MNP, INS, DEL}
- * Zygosity: {Hom, Het}
- * Quality
- * Coverage
- * Warnings
- * Gene ID
- * Gene Name
- * Bio Type
- * Transcript ID
- * Exon ID
- * Exon Rank
- * Effect
- * Old/New Amino Acid
- * Old/New Codon
- * Codon Num
- * CDS Size
- * Codons Around
- * Amino Acids Around
- * Custom Interval ID
- *
- * Note that we treat all except the Chromosome, Position, and Effect fields as optional.
- *
- *
- *
- * See also: @see SNPEff project page
- *
- *
- * @author David Roazen
- * @since 2011
- */
-public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec {
-
- public static final int EXPECTED_NUMBER_OF_FIELDS = 23;
- public static final String FIELD_DELIMITER_PATTERN = "\\t";
- public static final String EFFECT_FIELD_DELIMITER_PATTERN = "[,:]";
- public static final String HEADER_LINE_START = "# ";
- public static final String[] HEADER_FIELD_NAMES = { "Chromo",
- "Position",
- "Reference",
- "Change",
- "Change type",
- "Homozygous",
- "Quality",
- "Coverage",
- "Warnings",
- "Gene_ID",
- "Gene_name",
- "Bio_type",
- "Trancript_ID", // yes, this is how it's spelled in the SnpEff output
- "Exon_ID",
- "Exon_Rank",
- "Effect",
- "old_AA/new_AA",
- "Old_codon/New_codon",
- "Codon_Num(CDS)",
- "CDS_size",
- "Codons around",
- "AAs around",
- "Custom_interval_ID"
- };
-
- // The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line:
- public static final int[] REQUIRED_FIELDS = { 0, 1, 15 };
-
- public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE";
-
-
- public Feature decodeLoc ( String line ) {
- return decode(line);
- }
-
- public Feature decode ( String line ) {
- String[] tokens = line.split(FIELD_DELIMITER_PATTERN, -1);
-
- if ( tokens.length != EXPECTED_NUMBER_OF_FIELDS ) {
- throw new TribbleException.InvalidDecodeLine("Line does not have the expected (" + EXPECTED_NUMBER_OF_FIELDS +
- ") number of fields: found " + tokens.length + " fields.", line);
- }
-
- try {
- trimAllFields(tokens);
- checkForRequiredFields(tokens, line);
-
- String contig = tokens[0];
- long position = Long.parseLong(tokens[1]);
-
- String reference = tokens[2].isEmpty() ? null : tokens[2];
- String change = tokens[3].isEmpty() ? null : tokens[3];
- ChangeType changeType = tokens[4].isEmpty() ? null : ChangeType.valueOf(tokens[4]);
- Zygosity zygosity = tokens[5].isEmpty() ? null : Zygosity.valueOf(tokens[5]);
- Double quality = tokens[6].isEmpty() ? null : Double.parseDouble(tokens[6]);
- Long coverage = tokens[7].isEmpty() ? null : Long.parseLong(tokens[7]);
- String warnings = tokens[8].isEmpty() ? null : tokens[8];
- String geneID = tokens[9].isEmpty() ? null : tokens[9];
- String geneName = tokens[10].isEmpty() ? null : tokens[10];
- String bioType = tokens[11].isEmpty() ? null : tokens[11];
- String transcriptID = tokens[12].isEmpty() ? null : tokens[12];
- String exonID = tokens[13].isEmpty() ? null : tokens[13];
- Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]);
-
- boolean isNonCodingGene = isNonCodingGene(tokens[15]);
-
- // Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present,
- // otherwise split it into two subfields. We need this limit to prevent the extra effect-related information
- // in the final field (when present) from being inappropriately tokenized:
-
- int effectFieldTokenLimit = isNonCodingGene ? 3 : 2;
- String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit);
- EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene);
- String effectExtraInformation = parseEffectExtraInformation(effectFieldTokens, isNonCodingGene);
-
- String oldAndNewAA = tokens[16].isEmpty() ? null : tokens[16];
- String oldAndNewCodon = tokens[17].isEmpty() ? null : tokens[17];
- Integer codonNum = tokens[18].isEmpty() ? null : Integer.parseInt(tokens[18]);
- Integer cdsSize = tokens[19].isEmpty() ? null : Integer.parseInt(tokens[19]);
- String codonsAround = tokens[20].isEmpty() ? null : tokens[20];
- String aasAround = tokens[21].isEmpty() ? null : tokens[21];
- String customIntervalID = tokens[22].isEmpty() ? null : tokens[22];
-
- return new SnpEffFeature(contig, position, reference, change, changeType, zygosity, quality, coverage,
- warnings, geneID, geneName, bioType, transcriptID, exonID, exonRank, isNonCodingGene,
- effect, effectExtraInformation, oldAndNewAA, oldAndNewCodon, codonNum, cdsSize,
- codonsAround, aasAround, customIntervalID);
- }
- catch ( NumberFormatException e ) {
- throw new TribbleException.InvalidDecodeLine("Error parsing a numeric field : " + e.getMessage(), line);
- }
- catch ( IllegalArgumentException e ) {
- throw new TribbleException.InvalidDecodeLine("Illegal value in field: " + e.getMessage(), line);
- }
- }
-
- private void trimAllFields ( String[] tokens ) {
- for ( int i = 0; i < tokens.length; i++ ) {
- tokens[i] = tokens[i].trim();
- }
- }
-
- private void checkForRequiredFields ( String[] tokens, String line ) {
- for ( int requiredFieldIndex : REQUIRED_FIELDS ) {
- if ( tokens[requiredFieldIndex].isEmpty() ) {
- throw new TribbleException.InvalidDecodeLine("Line is missing required field \"" +
- HEADER_FIELD_NAMES[requiredFieldIndex] + "\"",
- line);
- }
- }
- }
-
- private boolean isNonCodingGene ( String effectField ) {
- return effectField.startsWith(NON_CODING_GENE_FLAG);
- }
-
- private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) {
- String effectName = "";
-
- // If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield,
- // otherwise it will be in the first subfield:
-
- if ( effectFieldTokens.length > 1 && isNonCodingGene ) {
- effectName = effectFieldTokens[1].trim();
- }
- else {
- effectName = effectFieldTokens[0].trim();
- }
-
- return EffectType.valueOf(effectName);
- }
-
- private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) {
-
- // The extra effect-related information, if present, will always be the last subfield:
-
- if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) {
- return effectFieldTokens[effectFieldTokens.length - 1].trim();
- }
-
- return null;
- }
-
- public Class getFeatureType() {
- return SnpEffFeature.class;
- }
-
- public Object readHeader ( LineReader reader ) {
- String headerLine = "";
-
- try {
- headerLine = reader.readLine();
- }
- catch ( IOException e ) {
- throw new TribbleException("Unable to read header line from input file.");
- }
-
- validateHeaderLine(headerLine);
- return headerLine;
- }
-
- private void validateHeaderLine ( String headerLine ) {
- if ( headerLine == null || ! headerLine.startsWith(HEADER_LINE_START) ) {
- throw new TribbleException.InvalidHeader("Header line does not start with " + HEADER_LINE_START);
- }
-
- String[] headerTokens = headerLine.substring(HEADER_LINE_START.length()).split(FIELD_DELIMITER_PATTERN);
-
- if ( headerTokens.length != EXPECTED_NUMBER_OF_FIELDS ) {
- throw new TribbleException.InvalidHeader("Header line does not contain headings for the expected number (" +
- EXPECTED_NUMBER_OF_FIELDS + ") of columns.");
- }
-
- for ( int columnIndex = 0; columnIndex < headerTokens.length; columnIndex++ ) {
- if ( ! HEADER_FIELD_NAMES[columnIndex].equals(headerTokens[columnIndex]) ) {
- throw new TribbleException.InvalidHeader("Header field #" + columnIndex + ": Expected \"" +
- HEADER_FIELD_NAMES[columnIndex] + "\" but found \"" +
- headerTokens[columnIndex] + "\"");
- }
- }
- }
-
- public boolean canDecode ( final File potentialInput ) {
- try {
- LineReader reader = new AsciiLineReader(new FileInputStream(potentialInput));
- readHeader(reader);
- }
- catch ( Exception e ) {
- return false;
- }
-
- return true;
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java
deleted file mode 100644
index 270db470f..000000000
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.codecs.snpEff;
-
-/**
- * A set of constants associated with the SnpEff codec.
- *
- * @author David Roazen
- */
-public class SnpEffConstants {
-
- // Possible SnpEff biological effects and their associated impacts:
- public enum EffectType {
- START_GAINED (EffectImpact.HIGH),
- START_LOST (EffectImpact.HIGH),
- EXON_DELETED (EffectImpact.HIGH),
- FRAME_SHIFT (EffectImpact.HIGH),
- STOP_GAINED (EffectImpact.HIGH),
- STOP_LOST (EffectImpact.HIGH),
- SPLICE_SITE_ACCEPTOR (EffectImpact.HIGH),
- SPLICE_SITE_DONOR (EffectImpact.HIGH),
-
- NON_SYNONYMOUS_CODING (EffectImpact.MODERATE),
- UTR_5_DELETED (EffectImpact.MODERATE),
- UTR_3_DELETED (EffectImpact.MODERATE),
- CODON_INSERTION (EffectImpact.MODERATE),
- CODON_CHANGE_PLUS_CODON_INSERTION (EffectImpact.MODERATE),
- CODON_DELETION (EffectImpact.MODERATE),
- CODON_CHANGE_PLUS_CODON_DELETION (EffectImpact.MODERATE),
-
- NONE (EffectImpact.LOW),
- CHROMOSOME (EffectImpact.LOW),
- INTERGENIC (EffectImpact.LOW),
- UPSTREAM (EffectImpact.LOW),
- UTR_5_PRIME (EffectImpact.LOW),
- SYNONYMOUS_START (EffectImpact.LOW),
- NON_SYNONYMOUS_START (EffectImpact.LOW),
- CDS (EffectImpact.LOW),
- GENE (EffectImpact.LOW),
- TRANSCRIPT (EffectImpact.LOW),
- EXON (EffectImpact.LOW),
- SYNONYMOUS_CODING (EffectImpact.LOW),
- CODON_CHANGE (EffectImpact.LOW),
- SYNONYMOUS_STOP (EffectImpact.LOW),
- NON_SYNONYMOUS_STOP (EffectImpact.LOW),
- INTRON (EffectImpact.LOW),
- UTR_3_PRIME (EffectImpact.LOW),
- DOWNSTREAM (EffectImpact.LOW),
- INTRON_CONSERVED (EffectImpact.LOW),
- INTERGENIC_CONSERVED (EffectImpact.LOW),
- CUSTOM (EffectImpact.LOW);
-
- private final EffectImpact impact;
-
- EffectType ( EffectImpact impact ) {
- this.impact = impact;
- }
-
- public EffectImpact getImpact() {
- return impact;
- }
- }
-
- public enum EffectImpact {
- LOW (1),
- MODERATE (2),
- HIGH (3);
-
- private final int severityRating;
-
- EffectImpact ( int severityRating ) {
- this.severityRating = severityRating;
- }
-
- public boolean isHigherImpactThan ( EffectImpact other ) {
- return this.severityRating > other.severityRating;
- }
- }
-
- // The kinds of variants supported by the SnpEff output format:
- public enum ChangeType {
- SNP,
- MNP,
- INS,
- DEL
- }
-
- // Possible zygosities of SnpEff variants:
- public enum Zygosity {
- Hom,
- Het
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java
deleted file mode 100644
index 2f120b7d2..000000000
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java
+++ /dev/null
@@ -1,423 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.codecs.snpEff;
-
-import org.broad.tribble.Feature;
-
-import java.util.NoSuchElementException;
-
-import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
-import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact;
-import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
-import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
-
-/**
- * Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output.
- *
- * Many fields are optional, and missing values are represented by nulls. You should always call the
- * hasX() method before calling the corresponding getX() method. Required fields can never be null
- * and do not have a hasX() method.
- *
- * @author David Roazen
- */
-public class SnpEffFeature implements Feature {
-
- private String contig; // REQUIRED FIELD
- private long position; // REQUIRED FIELD
- private String reference;
- private String change;
- private ChangeType changeType;
- private Zygosity zygosity;
- private Double quality;
- private Long coverage;
- private String warnings;
- private String geneID;
- private String geneName;
- private String bioType;
- private String transcriptID;
- private String exonID;
- private Integer exonRank;
- private boolean isNonCodingGene; // REQUIRED FIELD
- private EffectType effect; // REQUIRED FIELD
- private String effectExtraInformation;
- private String oldAndNewAA;
- private String oldAndNewCodon;
- private Integer codonNum;
- private Integer cdsSize;
- private String codonsAround;
- private String aasAround;
- private String customIntervalID;
-
- public SnpEffFeature ( String contig,
- long position,
- String reference,
- String change,
- ChangeType changeType,
- Zygosity zygosity,
- Double quality,
- Long coverage,
- String warnings,
- String geneID,
- String geneName,
- String bioType,
- String transcriptID,
- String exonID,
- Integer exonRank,
- boolean isNonCodingGene,
- EffectType effect,
- String effectExtraInformation,
- String oldAndNewAA,
- String oldAndNewCodon,
- Integer codonNum,
- Integer cdsSize,
- String codonsAround,
- String aasAround,
- String customIntervalID ) {
-
- if ( contig == null || effect == null ) {
- throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields");
- }
-
- this.contig = contig;
- this.position = position;
- this.reference = reference;
- this.change = change;
- this.changeType = changeType;
- this.zygosity = zygosity;
- this.quality = quality;
- this.coverage = coverage;
- this.warnings = warnings;
- this.geneID = geneID;
- this.geneName = geneName;
- this.bioType = bioType;
- this.transcriptID = transcriptID;
- this.exonID = exonID;
- this.exonRank = exonRank;
- this.isNonCodingGene = isNonCodingGene;
- this.effect = effect;
- this.effectExtraInformation = effectExtraInformation;
- this.oldAndNewAA = oldAndNewAA;
- this.oldAndNewCodon = oldAndNewCodon;
- this.codonNum = codonNum;
- this.cdsSize = cdsSize;
- this.codonsAround = codonsAround;
- this.aasAround = aasAround;
- this.customIntervalID = customIntervalID;
- }
-
- public boolean isHigherImpactThan ( SnpEffFeature other ) {
-
- // If one effect is in a non-coding gene and the other is not, the effect NOT in the
- // non-coding gene has higher impact:
-
- if ( ! isNonCodingGene() && other.isNonCodingGene() ) {
- return true;
- }
- else if ( isNonCodingGene() && ! other.isNonCodingGene() ) {
- return false;
- }
-
- // Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts
- // of the effects themselves as defined in the SnpEffConstants class:
-
- return getEffectImpact().isHigherImpactThan(other.getEffectImpact());
- }
-
- public String getChr() {
- return contig;
- }
-
- public int getStart() {
- return (int)position;
- }
-
- public int getEnd() {
- return (int)position;
- }
-
- public boolean hasReference() {
- return reference != null;
- }
-
- public String getReference() {
- if ( reference == null ) throw new NoSuchElementException("This feature has no reference field");
- return reference;
- }
-
- public boolean hasChange() {
- return change != null;
- }
-
- public String getChange() {
- if ( change == null ) throw new NoSuchElementException("This feature has no change field");
- return change;
- }
-
- public boolean hasChangeType() {
- return changeType != null;
- }
-
- public ChangeType getChangeType() {
- if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field");
- return changeType;
- }
-
- public boolean hasZygosity() {
- return zygosity != null;
- }
-
- public Zygosity getZygosity() {
- if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field");
- return zygosity;
- }
-
- public boolean hasQuality() {
- return quality != null;
- }
-
- public Double getQuality() {
- if ( quality == null ) throw new NoSuchElementException("This feature has no quality field");
- return quality;
- }
-
- public boolean hasCoverage() {
- return coverage != null;
- }
-
- public Long getCoverage() {
- if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field");
- return coverage;
- }
-
- public boolean hasWarnings() {
- return warnings != null;
- }
-
- public String getWarnings() {
- if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field");
- return warnings;
- }
-
- public boolean hasGeneID() {
- return geneID != null;
- }
-
- public String getGeneID() {
- if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field");
- return geneID;
- }
-
- public boolean hasGeneName() {
- return geneName != null;
- }
-
- public String getGeneName() {
- if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field");
- return geneName;
- }
-
- public boolean hasBioType() {
- return bioType != null;
- }
-
- public String getBioType() {
- if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field");
- return bioType;
- }
-
- public boolean hasTranscriptID() {
- return transcriptID != null;
- }
-
- public String getTranscriptID() {
- if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field");
- return transcriptID;
- }
-
- public boolean hasExonID() {
- return exonID != null;
- }
-
- public String getExonID() {
- if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field");
- return exonID;
- }
-
- public boolean hasExonRank() {
- return exonRank != null;
- }
-
- public Integer getExonRank() {
- if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field");
- return exonRank;
- }
-
- public boolean isNonCodingGene() {
- return isNonCodingGene;
- }
-
- public EffectType getEffect() {
- return effect;
- }
-
- public EffectImpact getEffectImpact() {
- return effect.getImpact();
- }
-
- public boolean hasEffectExtraInformation() {
- return effectExtraInformation != null;
- }
-
- public String getEffectExtraInformation() {
- if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field");
- return effectExtraInformation;
- }
-
- public boolean hasOldAndNewAA() {
- return oldAndNewAA != null;
- }
-
- public String getOldAndNewAA() {
- if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field");
- return oldAndNewAA;
- }
-
- public boolean hasOldAndNewCodon() {
- return oldAndNewCodon != null;
- }
-
- public String getOldAndNewCodon() {
- if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field");
- return oldAndNewCodon;
- }
-
- public boolean hasCodonNum() {
- return codonNum != null;
- }
-
- public Integer getCodonNum() {
- if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field");
- return codonNum;
- }
-
- public boolean hasCdsSize() {
- return cdsSize != null;
- }
-
- public Integer getCdsSize() {
- if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field");
- return cdsSize;
- }
-
- public boolean hasCodonsAround() {
- return codonsAround != null;
- }
-
- public String getCodonsAround() {
- if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field");
- return codonsAround;
- }
-
- public boolean hadAasAround() {
- return aasAround != null;
- }
-
- public String getAasAround() {
- if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field");
- return aasAround;
- }
-
- public boolean hasCustomIntervalID() {
- return customIntervalID != null;
- }
-
- public String getCustomIntervalID() {
- if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field");
- return customIntervalID;
- }
-
- public boolean equals ( Object o ) {
- if ( o == null || ! (o instanceof SnpEffFeature) ) {
- return false;
- }
-
- SnpEffFeature other = (SnpEffFeature)o;
-
- return contig.equals(other.contig) &&
- position == other.position &&
- (reference == null ? other.reference == null : reference.equals(other.reference)) &&
- (change == null ? other.change == null : change.equals(other.change)) &&
- changeType == other.changeType &&
- zygosity == other.zygosity &&
- (quality == null ? other.quality == null : quality.equals(other.quality)) &&
- (coverage == null ? other.coverage == null : coverage.equals(other.coverage)) &&
- (warnings == null ? other.warnings == null : warnings.equals(other.warnings)) &&
- (geneID == null ? other.geneID == null : geneID.equals(other.geneID)) &&
- (geneName == null ? other.geneName == null : geneName.equals(other.geneName)) &&
- (bioType == null ? other.bioType == null : bioType.equals(other.bioType)) &&
- (transcriptID == null ? other.transcriptID == null : transcriptID.equals(other.transcriptID)) &&
- (exonID == null ? other.exonID == null : exonID.equals(other.exonID)) &&
- (exonRank == null ? other.exonRank == null : exonRank.equals(other.exonRank)) &&
- isNonCodingGene == other.isNonCodingGene &&
- effect == other.effect &&
- (effectExtraInformation == null ? other.effectExtraInformation == null : effectExtraInformation.equals(other.effectExtraInformation)) &&
- (oldAndNewAA == null ? other.oldAndNewAA == null : oldAndNewAA.equals(other.oldAndNewAA)) &&
- (oldAndNewCodon == null ? other.oldAndNewCodon == null : oldAndNewCodon.equals(other.oldAndNewCodon)) &&
- (codonNum == null ? other.codonNum == null : codonNum.equals(other.codonNum)) &&
- (cdsSize == null ? other.cdsSize == null : cdsSize.equals(other.cdsSize)) &&
- (codonsAround == null ? other.codonsAround == null : codonsAround.equals(other.codonsAround)) &&
- (aasAround == null ? other.aasAround == null : aasAround.equals(other.aasAround)) &&
- (customIntervalID == null ? other.customIntervalID == null : customIntervalID.equals(other.customIntervalID));
- }
-
- public String toString() {
- return "[Contig: " + contig +
- " Position: " + position +
- " Reference: " + reference +
- " Change: " + change +
- " Change Type: " + changeType +
- " Zygosity: " + zygosity +
- " Quality: " + quality +
- " Coverage: " + coverage +
- " Warnings: " + warnings +
- " Gene ID: " + geneID +
- " Gene Name: " + geneName +
- " Bio Type: " + bioType +
- " Transcript ID: " + transcriptID +
- " Exon ID: " + exonID +
- " Exon Rank: " + exonRank +
- " Non-Coding Gene: " + isNonCodingGene +
- " Effect: " + effect +
- " Effect Extra Information: " + effectExtraInformation +
- " Old/New AA: " + oldAndNewAA +
- " Old/New Codon: " + oldAndNewCodon +
- " Codon Num: " + codonNum +
- " CDS Size: " + cdsSize +
- " Codons Around: " + codonsAround +
- " AAs Around: " + aasAround +
- " Custom Interval ID: " + customIntervalID +
- "]";
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
index eb01e5dca..fd1c74993 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
@@ -24,6 +24,7 @@ public class VCFHeader {
private final Set mMetaData;
private final Map mInfoMetaData = new HashMap();
private final Map mFormatMetaData = new HashMap();
+ private final Map mOtherMetaData = new HashMap();
// the list of auxillary tags
private final Set mGenotypeSampleNames = new LinkedHashSet();
@@ -110,6 +111,9 @@ public class VCFHeader {
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
mFormatMetaData.put(formatLine.getName(), formatLine);
}
+ else {
+ mOtherMetaData.put(line.getKey(), line);
+ }
}
}
@@ -185,6 +189,14 @@ public class VCFHeader {
public VCFFormatHeaderLine getFormatHeaderLine(String key) {
return mFormatMetaData.get(key);
}
+
+ /**
+ * @param key the header key name
+ * @return the meta data line, or null if there is none
+ */
+ public VCFHeaderLine getOtherHeaderLine(String key) {
+ return mOtherMetaData.get(key);
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
index 1c65102ae..cfd59b504 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@@ -817,6 +817,28 @@ public class VariantContext implements Feature { // to enable tribble intergrati
throw new IllegalArgumentException("Requested " + i + " alternative allele but there are only " + n + " alternative alleles " + this);
}
+ /**
+ * @param other VariantContext whose alternate alleles to compare against
+ * @return true if this VariantContext has the same alternate alleles as other,
+ * regardless of ordering. Otherwise returns false.
+ */
+ public boolean hasSameAlternateAllelesAs ( VariantContext other ) {
+ Set thisAlternateAlleles = getAlternateAlleles();
+ Set otherAlternateAlleles = other.getAlternateAlleles();
+
+ if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) {
+ return false;
+ }
+
+ for ( Allele allele : thisAlternateAlleles ) {
+ if ( ! otherAlternateAlleles.contains(allele) ) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
// ---------------------------------------------------------------------------------------------------------
//
// Working with genotypes
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
new file mode 100644
index 000000000..462abeba1
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.testng.Assert;
+import org.testng.annotations.Test;
+import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff.SnpEffEffect;
+
+public class SnpEffUnitTest {
+
+ @Test
+ public void testParseWellFormedEffect() {
+ String effectName = "NON_SYNONYMOUS_CODING";
+ String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
+
+ SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
+ Assert.assertTrue( effect.isWellFormed() && effect.isCoding() );
+ }
+
+ @Test
+ public void testParseInvalidEffectNameEffect() {
+ String effectName = "MADE_UP_EFFECT";
+ String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
+
+ SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
+ Assert.assertFalse(effect.isWellFormed());
+ }
+
+ @Test
+ public void testParseInvalidEffectImpactEffect() {
+ String effectName = "NON_SYNONYMOUS_CODING";
+ String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
+
+ SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
+ Assert.assertFalse(effect.isWellFormed());
+ }
+
+ @Test
+ public void testParseWrongNumberOfMetadataFieldsEffect() {
+ String effectName = "NON_SYNONYMOUS_CODING";
+ String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" };
+
+ SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
+ Assert.assertFalse(effect.isWellFormed());
+ }
+
+ @Test
+ public void testParseSnpEffWarningEffect() {
+ String effectName = "NON_SYNONYMOUS_CODING";
+ String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" };
+
+ SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
+ Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") );
+ }
+
+ @Test
+ public void testParseSnpEffErrorEffect() {
+ String effectName = "NON_SYNONYMOUS_CODING";
+ String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" };
+
+ SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
+ Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") );
+ }
+}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index 832079807..f902ce276 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.WalkerTest;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import java.util.Arrays;
@@ -129,12 +130,24 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
@Test
public void testSnpEffAnnotations() {
WalkerTestSpec spec = new WalkerTestSpec(
- "-T VariantAnnotator -R " + b37KGReference + " -NO_HEADER -o %s -A SnpEff --variant " +
- validationDataLocation + "1000G.exomes.vcf --snpEffFile " + validationDataLocation +
- "snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out -L 1:26,000,000-26,500,000",
+ "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " +
+ validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation +
+ "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000",
1,
- Arrays.asList("03eae1dab19a9358250890594bf53607")
+ Arrays.asList("a1c3ba9efc28ee0606339604095076ea")
);
executeTest("Testing SnpEff annotations", spec);
}
+
+ @Test
+ public void testSnpEffAnnotationsUnsupportedVersion() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " +
+ validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation +
+ "snpEff.AFR.unfiltered.unsupported.version.vcf -L 1:1-1,500,000",
+ 1,
+ UserException.class
+ );
+ executeTest("Testing SnpEff annotations (unsupported version)", spec);
+ }
}
diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java
deleted file mode 100644
index 6d492565b..000000000
--- a/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.codecs.snpEff;
-
-import org.apache.commons.io.input.ReaderInputStream;
-import org.broad.tribble.TribbleException;
-import org.broad.tribble.readers.AsciiLineReader;
-import org.broad.tribble.readers.LineReader;
-import org.testng.Assert;
-import org.testng.annotations.Test;
-
-import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
-import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
-import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
-
-import java.io.StringReader;
-
-public class SnpEffCodecUnitTest {
-
- @Test
- public void testParseWellFormedSnpEffHeaderLine() {
- String wellFormedSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" +
- "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" +
- "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" +
- "AAs around\tCustom_interval_ID";
-
- SnpEffCodec codec = new SnpEffCodec();
- LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wellFormedSnpEffHeaderLine)));
- String headerReturned = (String)codec.readHeader(reader);
-
- Assert.assertEquals(headerReturned, wellFormedSnpEffHeaderLine);
- }
-
- @Test(expectedExceptions = TribbleException.InvalidHeader.class)
- public void testParseWrongNumberOfFieldsSnpEffHeaderLine() {
- String wrongNumberOfFieldsSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" +
- "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" +
- "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" +
- "AAs around";
-
- SnpEffCodec codec = new SnpEffCodec();
- LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wrongNumberOfFieldsSnpEffHeaderLine)));
- codec.readHeader(reader);
- }
-
- @Test(expectedExceptions = TribbleException.InvalidHeader.class)
- public void testParseMisnamedColumnSnpEffHeaderLine() {
- String misnamedColumnSnpEffHeaderLine = "# Chromo\tPosition\tRef\tChange\tChange type\t" +
- "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" +
- "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" +
- "AAs around\tCustom_interval_ID";
-
- SnpEffCodec codec = new SnpEffCodec();
- LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(misnamedColumnSnpEffHeaderLine)));
- codec.readHeader(reader);
- }
-
- @Test
- public void testParseSimpleEffectSnpEffLine() {
- String simpleEffectSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" +
- "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t\t";
-
- SnpEffFeature expectedFeature = new SnpEffFeature("1",
- 69428l,
- "T",
- "G",
- ChangeType.SNP,
- Zygosity.Hom,
- 6049.69,
- 61573l,
- null,
- "ENSG00000177693",
- "OR4F5",
- "mRNA",
- "ENST00000326183",
- "exon_1_69055_70108",
- 1,
- false,
- EffectType.NON_SYNONYMOUS_CODING,
- null,
- "F/C",
- "TTT/TGT",
- 113,
- 918,
- null,
- null,
- null
- );
-
- SnpEffCodec codec = new SnpEffCodec();
- SnpEffFeature feature = (SnpEffFeature)codec.decode(simpleEffectSnpEffLine);
-
- Assert.assertEquals(feature, expectedFeature);
- }
-
- @Test
- public void testParseNonCodingRegionSnpEffLine() {
- String nonCodingRegionSnpEffLine = "1\t1337592\tG\tC\tSNP\tHom\t1935.52\t21885\t\tENSG00000250188\t" +
- "RP4-758J18.5\tmRNA\tENST00000514958\texon_1_1337454_1338076\t2\tWITHIN_NON_CODING_GENE, NON_SYNONYMOUS_CODING\t" +
- "L/V\tCTA/GTA\t272\t952\t\t\t";
-
- SnpEffFeature expectedFeature = new SnpEffFeature("1",
- 1337592l,
- "G",
- "C",
- ChangeType.SNP,
- Zygosity.Hom,
- 1935.52,
- 21885l,
- null,
- "ENSG00000250188",
- "RP4-758J18.5",
- "mRNA",
- "ENST00000514958",
- "exon_1_1337454_1338076",
- 2,
- true,
- EffectType.NON_SYNONYMOUS_CODING,
- null,
- "L/V",
- "CTA/GTA",
- 272,
- 952,
- null,
- null,
- null
- );
-
- SnpEffCodec codec = new SnpEffCodec();
- SnpEffFeature feature = (SnpEffFeature)codec.decode(nonCodingRegionSnpEffLine);
-
- Assert.assertEquals(feature, expectedFeature);
- }
-
- @Test
- public void testParseExtraEffectInformationSnpEffLine() {
- String extraEffectInformationSnpEffLine = "1\t879537\tT\tC\tSNP\tHom\t341.58\t13733\t\tENSG00000187634\tSAMD11\t" +
- "mRNA\tENST00000341065\t\t\tUTR_3_PRIME: 4 bases from transcript end\t\t\t\t\t\t\t";
-
- SnpEffFeature expectedFeature = new SnpEffFeature("1",
- 879537l,
- "T",
- "C",
- ChangeType.SNP,
- Zygosity.Hom,
- 341.58,
- 13733l,
- null,
- "ENSG00000187634",
- "SAMD11",
- "mRNA",
- "ENST00000341065",
- null,
- null,
- false,
- EffectType.UTR_3_PRIME,
- "4 bases from transcript end",
- null,
- null,
- null,
- null,
- null,
- null,
- null
- );
-
- SnpEffCodec codec = new SnpEffCodec();
- SnpEffFeature feature = (SnpEffFeature)codec.decode(extraEffectInformationSnpEffLine);
-
- Assert.assertEquals(feature, expectedFeature);
- }
-
- @Test
- public void testParseMultiEffectSnpEffLine() {
- String multiEffectSnpEffLine = "1\t901901\tC\tT\tSNP\tHom\t162.91\t4646\t\tENSG00000187583\tPLEKHN1\tmRNA\t" +
- "ENST00000379410\texon_1_901877_901994\t1\tSTART_GAINED: ATG, UTR_5_PRIME: 11 bases from TSS\t\t\t\t\t\t\t";
-
- SnpEffFeature expectedFeature = new SnpEffFeature("1",
- 901901l,
- "C",
- "T",
- ChangeType.SNP,
- Zygosity.Hom,
- 162.91,
- 4646l,
- null,
- "ENSG00000187583",
- "PLEKHN1",
- "mRNA",
- "ENST00000379410",
- "exon_1_901877_901994",
- 1,
- false,
- EffectType.START_GAINED,
- "ATG, UTR_5_PRIME: 11 bases from TSS",
- null,
- null,
- null,
- null,
- null,
- null,
- null
- );
-
- SnpEffCodec codec = new SnpEffCodec();
- SnpEffFeature feature = (SnpEffFeature)codec.decode(multiEffectSnpEffLine);
-
- Assert.assertEquals(feature, expectedFeature);
- }
-
- @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class)
- public void testParseWrongNumberOfFieldsSnpEffLine() {
- String wrongNumberOfFieldsSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" +
- "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t";
-
- SnpEffCodec codec = new SnpEffCodec();
- SnpEffFeature feature = (SnpEffFeature)codec.decode(wrongNumberOfFieldsSnpEffLine);
- }
-
- @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class)
- public void testParseBlankEffectFieldSnpEffLine() {
- String blankEffectFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" +
- "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\t\tF/C\tTTT/TGT\t113\t918\t\t\t";
-
- SnpEffCodec codec = new SnpEffCodec();
- SnpEffFeature feature = (SnpEffFeature)codec.decode(blankEffectFieldSnpEffLine);
- }
-
- @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class)
- public void testParseInvalidNumericFieldSnpEffLine() {
- String invalidNumericFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" +
- "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\tfoo\t\t\t";;
-
- SnpEffCodec codec = new SnpEffCodec();
- SnpEffFeature feature = (SnpEffFeature)codec.decode(invalidNumericFieldSnpEffLine);
- }
-}
From e0c8c0ddcb48617d5f9146c2d9ffe57d967dd899 Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Wed, 14 Sep 2011 06:04:32 -0400
Subject: [PATCH 26/49] Modified VariantEval FunctionalClass stratification to
remove hardcoded GenomicAnnotator keynames
This is a temporary and hopefully short-lived solution. I've modified
the FunctionalClass stratification to stratify by effect impact as
defined by SnpEff annotations (high, moderate, and low impact) rather
than by the silent/missense/nonsense categories.
If we want to bring back the silent/missense/nonsense stratification,
we should probably take the approach of asking the SnpEff author
to add it as a feature to SnpEff rather than coding it ourselves,
since the whole point of moving to SnpEff was to outsource genomic
annotation.
---
.../stratifications/FunctionalClass.java | 53 ++++++++-----------
.../VariantEvalIntegrationTest.java | 7 ++-
2 files changed, 24 insertions(+), 36 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
index 193a65591..c675b111c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
@@ -2,21 +2,29 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.List;
/**
- * Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation.
+ * Stratifies by low-, moderate-, and high-impact genomic effect using SnpEff annotations produced by VariantAnnotator
*/
public class FunctionalClass extends VariantStratifier {
+
+ public static final String LOW_IMPACT_STATE_NAME = "low-impact";
+ public static final String MODERATE_IMPACT_STATE_NAME = "moderate-impact";
+ public static final String HIGH_IMPACT_STATE_NAME = "high-impact";
+
+ public static final String EFFECT_IMPACT_ATTRIBUTE_KEY = SnpEff.InfoFieldKey.EFF_IMPACT.toString();
+
@Override
public void initialize() {
states.add("all");
- states.add("silent");
- states.add("missense");
- states.add("nonsense");
+ states.add(LOW_IMPACT_STATE_NAME);
+ states.add(MODERATE_IMPACT_STATE_NAME);
+ states.add(HIGH_IMPACT_STATE_NAME);
}
@@ -25,36 +33,17 @@ public class FunctionalClass extends VariantStratifier {
relevantStates.add("all");
- if (eval != null && eval.isVariant()) {
- String type = null;
+ if ( eval != null && eval.isVariant() && eval.hasAttribute(EFFECT_IMPACT_ATTRIBUTE_KEY) ) {
+ String effectImpact = eval.getAttributeAsString(EFFECT_IMPACT_ATTRIBUTE_KEY);
- if (eval.hasAttribute("refseq.functionalClass")) {
- type = eval.getAttributeAsString("refseq.functionalClass");
- } else if (eval.hasAttribute("refseq.functionalClass_1")) {
- int annotationId = 1;
- String key;
-
- do {
- key = String.format("refseq.functionalClass_%d", annotationId);
-
- String newtype = eval.getAttributeAsString(key);
-
- if ( newtype != null && !newtype.equalsIgnoreCase("null") &&
- ( type == null ||
- ( type.equals("silent") && !newtype.equals("silent") ) ||
- ( type.equals("missense") && newtype.equals("nonsense") ) )
- ) {
- type = newtype;
- }
-
- annotationId++;
- } while (eval.hasAttribute(key));
+ if ( effectImpact.equals(SnpEff.EffectImpact.LOW.toString()) ) {
+ relevantStates.add(LOW_IMPACT_STATE_NAME);
}
-
- if (type != null) {
- if (type.equals("silent")) { relevantStates.add("silent"); }
- else if (type.equals("missense")) { relevantStates.add("missense"); }
- else if (type.equals("nonsense")) { relevantStates.add("nonsense"); }
+ else if ( effectImpact.equals(SnpEff.EffectImpact.MODERATE.toString()) ) {
+ relevantStates.add(MODERATE_IMPACT_STATE_NAME);
+ }
+ else if ( effectImpact.equals(SnpEff.EffectImpact.HIGH.toString()) ) {
+ relevantStates.add(HIGH_IMPACT_STATE_NAME);
}
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index e992684bc..00ecd5b67 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -123,9 +123,8 @@ public class VariantEvalIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
buildCommandLine(
"-T VariantEval",
- "-R " + b37KGReference,
- "--dbsnp " + b37dbSNP132,
- "--eval " + fundamentalTestVCF,
+ "-R " + hg19Reference,
+ "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf",
"-noEV",
"-EV CountVariants",
"-noST",
@@ -134,7 +133,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-o %s"
),
1,
- Arrays.asList("e40b77e7ed6581328e373a24b93cd170")
+ Arrays.asList("e93b3d66a5c150cbf1ae4262ec075d2d")
);
executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec);
}
From 3db457ed01f7d99eaa0d62b9924cba6f1d269dad Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Wed, 14 Sep 2011 10:47:28 -0400
Subject: [PATCH 27/49] Revert "Modified VariantEval FunctionalClass
stratification to remove hardcoded GenomicAnnotator keynames"
After discussing this with Mark, it seems clear that the old version of the
VariantEval FunctionalClass stratification is preferable to this version.
By reverting, we maintain backwards compatibility with legacy output files
from the old GenomicAnnotator, and can add SnpEff support later without
breaking that backwards compatibility.
This reverts commit b44acd1abd9ab6eec37111a19fa797f9e2ca3326.
---
.../stratifications/FunctionalClass.java | 53 +++++++++++--------
.../VariantEvalIntegrationTest.java | 7 +--
2 files changed, 36 insertions(+), 24 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
index c675b111c..193a65591 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
@@ -2,29 +2,21 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.List;
/**
- * Stratifies by low-, moderate-, and high-impact genomic effect using SnpEff annotations produced by VariantAnnotator
+ * Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation.
*/
public class FunctionalClass extends VariantStratifier {
-
- public static final String LOW_IMPACT_STATE_NAME = "low-impact";
- public static final String MODERATE_IMPACT_STATE_NAME = "moderate-impact";
- public static final String HIGH_IMPACT_STATE_NAME = "high-impact";
-
- public static final String EFFECT_IMPACT_ATTRIBUTE_KEY = SnpEff.InfoFieldKey.EFF_IMPACT.toString();
-
@Override
public void initialize() {
states.add("all");
- states.add(LOW_IMPACT_STATE_NAME);
- states.add(MODERATE_IMPACT_STATE_NAME);
- states.add(HIGH_IMPACT_STATE_NAME);
+ states.add("silent");
+ states.add("missense");
+ states.add("nonsense");
}
@@ -33,17 +25,36 @@ public class FunctionalClass extends VariantStratifier {
relevantStates.add("all");
- if ( eval != null && eval.isVariant() && eval.hasAttribute(EFFECT_IMPACT_ATTRIBUTE_KEY) ) {
- String effectImpact = eval.getAttributeAsString(EFFECT_IMPACT_ATTRIBUTE_KEY);
+ if (eval != null && eval.isVariant()) {
+ String type = null;
- if ( effectImpact.equals(SnpEff.EffectImpact.LOW.toString()) ) {
- relevantStates.add(LOW_IMPACT_STATE_NAME);
+ if (eval.hasAttribute("refseq.functionalClass")) {
+ type = eval.getAttributeAsString("refseq.functionalClass");
+ } else if (eval.hasAttribute("refseq.functionalClass_1")) {
+ int annotationId = 1;
+ String key;
+
+ do {
+ key = String.format("refseq.functionalClass_%d", annotationId);
+
+ String newtype = eval.getAttributeAsString(key);
+
+ if ( newtype != null && !newtype.equalsIgnoreCase("null") &&
+ ( type == null ||
+ ( type.equals("silent") && !newtype.equals("silent") ) ||
+ ( type.equals("missense") && newtype.equals("nonsense") ) )
+ ) {
+ type = newtype;
+ }
+
+ annotationId++;
+ } while (eval.hasAttribute(key));
}
- else if ( effectImpact.equals(SnpEff.EffectImpact.MODERATE.toString()) ) {
- relevantStates.add(MODERATE_IMPACT_STATE_NAME);
- }
- else if ( effectImpact.equals(SnpEff.EffectImpact.HIGH.toString()) ) {
- relevantStates.add(HIGH_IMPACT_STATE_NAME);
+
+ if (type != null) {
+ if (type.equals("silent")) { relevantStates.add("silent"); }
+ else if (type.equals("missense")) { relevantStates.add("missense"); }
+ else if (type.equals("nonsense")) { relevantStates.add("nonsense"); }
}
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index 00ecd5b67..e992684bc 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -123,8 +123,9 @@ public class VariantEvalIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
buildCommandLine(
"-T VariantEval",
- "-R " + hg19Reference,
- "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf",
+ "-R " + b37KGReference,
+ "--dbsnp " + b37dbSNP132,
+ "--eval " + fundamentalTestVCF,
"-noEV",
"-EV CountVariants",
"-noST",
@@ -133,7 +134,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-o %s"
),
1,
- Arrays.asList("e93b3d66a5c150cbf1ae4262ec075d2d")
+ Arrays.asList("e40b77e7ed6581328e373a24b93cd170")
);
executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec);
}
From a942fa38ef87c1e1565d6a1cff041d8a62aaeb0a Mon Sep 17 00:00:00 2001
From: Guillermo del Angel
Date: Thu, 15 Sep 2011 10:22:28 -0400
Subject: [PATCH 28/49] Refine the way we merge records in CombineVariants of
different types. As of before, two records of different types were not
combined and were kept separate. This is still the case, except when the
alleles of one record are a strict subset of alleles of another record. For
example, a SNP with alleles {A*,T} and a mixed record with alleles {A*,T,
AAT} are now combined when start position matches.
---
.../walkers/variantutils/CombineVariants.java | 35 +++++++++++++++++--
.../variantcontext/VariantContextUtils.java | 12 +++++++
2 files changed, 45 insertions(+), 2 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
index 7062f17e5..3e3b29a7f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
@@ -234,16 +234,47 @@ public class CombineVariants extends RodWalker {
if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
return 0;
- List mergedVCs = new ArrayList();
+ List preMergedVCs = new ArrayList();
Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
// iterate over the types so that it's deterministic
for ( VariantContext.Type type : VariantContext.Type.values() ) {
if ( VCsByType.containsKey(type) )
- mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
+ preMergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
+ List mergedVCs = new ArrayList();
+    // we have records merged but separated by type. If a particular record is for example a snp but all alleles are a subset of an existing mixed record,
+ // we will still merge those records.
+ if (preMergedVCs.size() > 1) {
+ for (VariantContext vc1 : preMergedVCs) {
+ VariantContext newvc = vc1;
+ boolean merged = false;
+ for (int k=0; k < mergedVCs.size(); k++) {
+ VariantContext vc2 = mergedVCs.get(k);
+
+ if (VariantContextUtils.allelesAreSubset(vc1,vc2) || VariantContextUtils.allelesAreSubset(vc2,vc1)) {
+                    // all alleles of vc1 are contained in vc2 but they are of different type (say, vc1 is snp, vc2 is complex): try to merge vc1 into vc2
+ List vcpair = new ArrayList();
+ vcpair.add(vc1);
+ vcpair.add(vc2);
+ newvc = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcpair,
+ priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
+ SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC);
+ mergedVCs.set(k,newvc);
+ merged = true;
+ break;
+ }
+ }
+ if (!merged)
+ mergedVCs.add(vc1);
+ }
+ }
+ else {
+ mergedVCs = preMergedVCs;
+ }
+
for ( VariantContext mergedVC : mergedVCs ) {
// only operate at the start of events
if ( mergedVC == null )
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
index 986d6305c..506bb3b33 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
@@ -663,6 +663,18 @@ public class VariantContextUtils {
return merged;
}
+ public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) {
+ // if all alleles of vc1 are contained in the alleles of vc2, return true
+ if (!vc1.getReference().equals(vc2.getReference()))
+ return false;
+
+ for (Allele a :vc1.getAlternateAlleles()) {
+ if (!vc2.getAlternateAlleles().contains(a))
+ return false;
+ }
+
+ return true;
+ }
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
// see if we need to trim common reference base from all alleles
boolean trimVC;
From 1e682deb26aad12d4421cf9ff7f08318c1cd4ab3 Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Thu, 15 Sep 2011 13:07:50 -0400
Subject: [PATCH 29/49] Minor html-formatting-related documentation fix to the
SnpEff class.
---
.../org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
index 14abbca5b..bb3685fb5 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
@@ -45,7 +45,7 @@ import java.util.*;
* (http://snpeff.sourceforge.net/).
*
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
- * output file (which must be provided on the command line via --snpEffFile .vcf),
+ * output file (which must be provided on the command line via --snpEffFile filename.vcf),
* and adds annotations on that effect.
*
* @author David Roazen
From 202405b1a165db3f1d97f687da10826df181c849 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 13:52:31 -0400
Subject: [PATCH 30/49] Updating the FunctionalClass stratification in
VariantEval to handle the snpEff annotations; this change really needs to be
in before the release so that the pipeline can output semi-meaningful plots.
This commit maintains backwards compatibility with the crappy Genomic
Annotator output. However, I did clean up the code a bit so that we now use
an Enum instead of hard-coded values (so it's now much easier to change
things if we choose to do so in the future). I do not see this as the final
commit on this topic - I think we need to make some changes to the snpEff
annotator to preferentially choose certain annotations within effect classes;
Mark, let's chat about this for a bit when you get back next week. Also, for
the record, I should be blamed for David's temporary commit the other day
because I gave him the green light (since when do you care about backwards
compatibility anyways?). In any case, at least now we have something that
works for both the old and new annotations.
---
.../stratifications/FunctionalClass.java | 53 +++++++++++++------
.../VariantEvalIntegrationTest.java | 25 ++++++++-
2 files changed, 59 insertions(+), 19 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
index 193a65591..a32857ffc 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
@@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
@@ -11,12 +12,19 @@ import java.util.List;
* Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation.
*/
public class FunctionalClass extends VariantStratifier {
+
+ public enum FunctionalType {
+ silent,
+ missense,
+ nonsense
+ }
+
+
@Override
public void initialize() {
states.add("all");
- states.add("silent");
- states.add("missense");
- states.add("nonsense");
+ for ( FunctionalType type : FunctionalType.values() )
+ states.add(type.name());
}
@@ -26,10 +34,12 @@ public class FunctionalClass extends VariantStratifier {
relevantStates.add("all");
if (eval != null && eval.isVariant()) {
- String type = null;
+ FunctionalType type = null;
if (eval.hasAttribute("refseq.functionalClass")) {
- type = eval.getAttributeAsString("refseq.functionalClass");
+ try {
+ type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass"));
+ } catch ( Exception e ) {} // don't error out if the type isn't supported
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
int annotationId = 1;
String key;
@@ -37,24 +47,33 @@ public class FunctionalClass extends VariantStratifier {
do {
key = String.format("refseq.functionalClass_%d", annotationId);
- String newtype = eval.getAttributeAsString(key);
-
- if ( newtype != null && !newtype.equalsIgnoreCase("null") &&
- ( type == null ||
- ( type.equals("silent") && !newtype.equals("silent") ) ||
- ( type.equals("missense") && newtype.equals("nonsense") ) )
- ) {
- type = newtype;
+ String newtypeStr = eval.getAttributeAsString(key);
+ if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) {
+ try {
+ FunctionalType newType = FunctionalType.valueOf(newtypeStr);
+ if ( type == null ||
+ ( type == FunctionalType.silent && newType != FunctionalType.silent ) ||
+ ( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) {
+ type = newType;
+ }
+ } catch ( Exception e ) {} // don't error out if the type isn't supported
}
annotationId++;
} while (eval.hasAttribute(key));
+
+ } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFF.name() ) ) {
+ SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFF.name()).toString());
+ if ( snpEffType == SnpEff.EffectType.STOP_GAINED )
+ type = FunctionalType.nonsense;
+ else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING )
+ type = FunctionalType.missense;
+ else if ( snpEffType == SnpEff.EffectType.SYNONYMOUS_CODING )
+ type = FunctionalType.silent;
}
- if (type != null) {
- if (type.equals("silent")) { relevantStates.add("silent"); }
- else if (type.equals("missense")) { relevantStates.add("missense"); }
- else if (type.equals("nonsense")) { relevantStates.add("nonsense"); }
+ if ( type != null ) {
+ relevantStates.add(type.name());
}
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index e992684bc..d8f7ad3b6 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -6,7 +6,7 @@ import org.testng.annotations.Test;
import java.util.Arrays;
public class VariantEvalIntegrationTest extends WalkerTest {
- private static String variantEvalTestDataRoot = validationDataLocation + "/VariantEval";
+ private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval";
private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf";
private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf";
private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf";
@@ -14,6 +14,27 @@ public class VariantEvalIntegrationTest extends WalkerTest {
private static String cmdRoot = "-T VariantEval" +
" -R " + b36KGReference;
+ @Test
+ public void testFunctionClassWithSnpeff() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T VariantEval",
+ "-R " + b37KGReference,
+ "--dbsnp " + b37dbSNP132,
+ "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf",
+ "-noEV",
+ "-EV TiTvVariantEvaluator",
+ "-noST",
+ "-ST FunctionalClass",
+ "-BTI eval",
+ "-o %s"
+ ),
+ 1,
+ Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12")
+ );
+ executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec);
+ }
+
@Test
public void testStratifySamplesAndExcludeMonomorphicSites() {
WalkerTestSpec spec = new WalkerTestSpec(
@@ -21,7 +42,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-T VariantEval",
"-R " + b37KGReference,
"--dbsnp " + b37dbSNP132,
- "--eval " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf",
+ "--eval " + variantEvalTestDataRoot + "CEU.trio.callsForVE.vcf",
"-noEV",
"-EV TiTvVariantEvaluator",
"-ST Sample",
From d369d105932b457a01b47166b7b7a1cd41d6b337 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 13:56:23 -0400
Subject: [PATCH 31/49] Adding documentation before the release for GATK wiki
page
---
.../broadinstitute/sting/gatk/filters/PlatformFilter.java | 2 +-
.../sting/gatk/walkers/PrintReadsWalker.java | 7 +++++++
2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java
index 30b2f828d..8e241bb2c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java
@@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
* @version 0.1
*/
public class PlatformFilter extends ReadFilter {
- @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false)
+ @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false)
protected String[] PLFilterNames;
public boolean filterOut(SAMRecord rec) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
index fdfac6bf7..4f072e88c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
@@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
* -I input1.bam \
* -I input2.bam \
* --read_filter MappingQualityZero
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T PrintReads \
+ * -o output.bam \
+ * -I input.bam \
+ * -n 2000
*
*
*/
From ce73dc40712510a360738df49e802b4c21d95621 Mon Sep 17 00:00:00 2001
From: Christopher Hartl
Date: Thu, 15 Sep 2011 15:33:09 -0400
Subject: [PATCH 34/49] Update to the bindings for liftOverVCF.pl (to -V from
-B)
---
public/perl/liftOverVCF.pl | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/public/perl/liftOverVCF.pl b/public/perl/liftOverVCF.pl
index 21cb8bb6b..ba4198292 100755
--- a/public/perl/liftOverVCF.pl
+++ b/public/perl/liftOverVCF.pl
@@ -36,7 +36,7 @@ my $unsorted_vcf = "$tmp_prefix.unsorted.vcf";
# lift over the file
print "Lifting over the vcf...";
-my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
+my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
if ($recordOriginalLocation) {
$cmd .= " -recordOriginalLocation";
}
@@ -66,7 +66,7 @@ system($cmd) == 0 or quit("The sorting step failed. Please correct the necessar
# Filter the VCF for bad records
print "\nFixing/removing bad records...\n";
-$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out";
+$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out";
system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying.");
# clean up
From f04e51c6c2b74a79644e9473230410a8ba85fe92 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 15:38:56 -0400
Subject: [PATCH 35/49] Adding docs from Andrey since his repo was all screwed
up.
---
.../indels/SomaticIndelDetectorWalker.java | 143 ++++++++++++------
1 file changed, 94 insertions(+), 49 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
index e5ad3106d..8bba8eac2 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
@@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.*;
+
/**
+ * Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well,
+ * but this latter functionality is now superseded by UnifiedGenotyper.
+ *
+ *
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
- * data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read
- * and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample,
- * or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified,
- * the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline
- * if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic
- * if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains
- * only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords.
+ * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
+ * include additional statistics such as mismatches and base qualities around the calls, read strandness (how many
+ * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
+ * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
+ * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
+ * metrics for the post-processing tools to make the final decision). The calls are performed by default
+ * from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I
+ * command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments,
+ * respectively. Indels are called from the tumor sample and annotated as germline
+ * if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic
+ * if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling
+ * is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold
+ * in tumor sample, it will not be reported.
*
- * If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you,
- * please email asivache at broadinstitute dot org and he will gladly explain everything in more detail.
+ * To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input
+ * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
+ * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
*
+ *
Input
+ *
+ * Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
+ *
+ *
+ * Output
+ *
+ * Indel calls with associated metrics.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T SomaticIndelDetector \
+ * -o indels.vcf \
+ * -verbose indels.txt \
+ * -I:normal normal.bam \
+ * -I:tumor tumor.bam
+ *
*
*/
+
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
public class SomaticIndelDetectorWalker extends ReadWalker {
// @Output
// PrintStream out;
- @Output(doc="File to which variants should be written",required=true)
+ @Output(doc="File to write variants (indels) in VCF format",required=true)
protected VCFWriter vcf_writer = null;
@Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
@@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
@Hidden
@Argument(fullName = "genotype_intervals", shortName = "genotype",
- doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
+ doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false)
public String genotypeIntervalsFile = null;
@Hidden
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
- doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
- "if the list turns out to be unsorted, it will throw an exception. "+
- "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
- "to sort and keep it in memory (increases memory usage!).")
+ doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
+ "if the list turns out to be unsorted, it will throw an exception. "+
+ "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
+ "to sort and keep it in memory (increases memory usage!).")
protected boolean GENOTYPE_NOT_SORTED = false;
@Hidden
- @Argument(fullName="unpaired", shortName="unpaired",
- doc="Perform unpaired calls (no somatic status detection)", required=false)
+ @Argument(fullName="unpaired", shortName="unpaired",
+ doc="Perform unpaired calls (no somatic status detection)", required=false)
boolean call_unpaired = false;
- boolean call_somatic ;
+ boolean call_somatic ;
- @Argument(fullName="verboseOutput", shortName="verbose",
- doc="Verbose output file in text format", required=false)
- java.io.File verboseOutput = null;
+ @Argument(fullName="verboseOutput", shortName="verbose",
+ doc="Verbose output file in text format", required=false)
+ java.io.File verboseOutput = null;
@Argument(fullName="bedOutput", shortName="bed",
- doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
+ doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
java.io.File bedOutput = null;
- @Argument(fullName="minCoverage", shortName="minCoverage",
- doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false)
- int minCoverage = 6;
+ @Argument(fullName="minCoverage", shortName="minCoverage",
+ doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
+ "with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
+ int minCoverage = 6;
- @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
- doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
- int minNormalCoverage = 4;
+ @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
+ doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+
+ "or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
+ int minNormalCoverage = 4;
- @Argument(fullName="minFraction", shortName="minFraction",
- doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
- " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
- double minFraction = 0.3;
+ @Argument(fullName="minFraction", shortName="minFraction",
+ doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
+ " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
+ double minFraction = 0.3;
- @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
- doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false)
- double minConsensusFraction = 0.7;
+ @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
+ doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+
+ "all indel observations at the site exceeds this threshold", required=false)
+ double minConsensusFraction = 0.7;
- @Argument(fullName="minIndelCount", shortName="minCnt",
- doc="Minimum count of reads supporting consensus indel required for making the call. "+
- " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
- "(minIndelCount not met) will not pass.", required=false)
- int minIndelCount = 0;
+ @Argument(fullName="minIndelCount", shortName="minCnt",
+ doc="Minimum count of reads supporting consensus indel required for making the call. "+
+ " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
+ "(minIndelCount not met) will not pass.", required=false)
+ int minIndelCount = 0;
- @Argument(fullName="refseq", shortName="refseq",
- doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
- String RefseqFileName = null;
+ @Argument(fullName="refseq", shortName="refseq",
+ doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+
+ "GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
+ String RefseqFileName = null;
- @Argument(fullName="blacklistedLanes", shortName="BL",
- doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
- "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
- PlatformUnitFilterHelper dummy;
- @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false;
+//@Argument(fullName="blacklistedLanes", shortName="BL",
+// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
+// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
+//PlatformUnitFilterHelper dummy;
+
+ @Hidden
+ @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",
+ required=false) Boolean DEBUG = false;
@Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
- "May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200;
+ "May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+
+ "window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+
+ "than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+
+ "if window is too small",required=false) int WINDOW_SIZE = 200;
@Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
" the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
+
+
private WindowContext tumor_context;
private WindowContext normal_context;
private int currentContigIndex = -1;
From fe474b77f85f325ed20d6cb6c50dc298d024d03e Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:05:39 -0400
Subject: [PATCH 36/49] Updating docs so printing looks nicer
---
.../gatk/walkers/variantutils/VariantValidationAssessor.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
index b98646270..ea8549474 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
@@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
- * Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
+ * Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes)
*
*
* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes).
From 4ef6a4598c3704fd5aac5f5302a148ddfedd3958 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:10:34 -0400
Subject: [PATCH 37/49] Updating docs to include output
---
.../walkers/varianteval/VariantEvalWalker.java | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index 266b97af0..28f4f2a56 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -56,6 +56,22 @@ import java.util.*;
* Output
*
* Evaluation tables detailing the results of the eval modules which were applied.
+ * For example:
+ *
+ * output.eval.gatkreport:
+ * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
+ * CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ...
+ * CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ...
+ * CountVariants dbsnp CpG eval none known 65900028 47068 0 47068 0.00071423 ...
+ * CountVariants dbsnp CpG eval none novel 65900028 88702 0 88702 0.00134601 ...
+ * CountVariants dbsnp all eval none all 65900028 330818 0 330818 0.00502000 ...
+ * CountVariants dbsnp all eval none known 65900028 120685 0 120685 0.00183133 ...
+ * CountVariants dbsnp all eval none novel 65900028 210133 0 210133 0.00318866 ...
+ * CountVariants dbsnp non_CpG eval none all 65900028 195048 0 195048 0.00295976 ...
+ * CountVariants dbsnp non_CpG eval none known 65900028 73617 0 73617 0.00111710 ...
+ * CountVariants dbsnp non_CpG eval none novel 65900028 121431 0 121431 0.00184265 ...
+ * ...
+ *
*
*
* Examples
From 6d02a34bfba1537f294f5a077b24702e539b87a5 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:17:54 -0400
Subject: [PATCH 38/49] Updating docs to include output
---
.../variantutils/VariantValidationAssessor.java | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
index ea8549474..8eaf976d0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
@@ -57,7 +57,16 @@ import java.util.*;
*
* Output
*
- * An annotated VCF.
+ * An annotated VCF. Additionally, a table like the following will be output:
+ *
+ * Total number of samples assayed: 185
+ * Total number of records processed: 152
+ * Number of Hardy-Weinberg violations: 34 (22%)
+ * Number of no-call violations: 12 (7%)
+ * Number of homozygous variant violations: 0 (0%)
+ * Number of records passing all filters: 106 (69%)
+ * Number of passing records that are polymorphic: 98 (92%)
+ *
*
*
* Examples
From fd1831b4a520e68b15b6b5b958aa2d04ade4e287 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:25:03 -0400
Subject: [PATCH 39/49] Updating docs to include more details
---
.../gatk/walkers/fasta/FastaAlternateReferenceWalker.java | 6 ++++--
.../sting/gatk/walkers/fasta/FastaReferenceWalker.java | 3 +++
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
index fd912334f..4e2c17bf6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
@@ -43,8 +43,10 @@ import java.util.List;
* Generates an alternative reference sequence over the specified interval.
*
*
- * Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
- * Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
+ * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
+ * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
+ * Note that if there are multiple variants at a site, it takes the first one seen.
+ * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order).
*
*
Input
*
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
index 5f3b37cc8..7ae5c5c75 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
@@ -42,6 +42,9 @@ import java.io.PrintStream;
*
*
* The output format can be partially controlled using the provided command-line arguments.
+ * Specify intervals with the usual -L argument to output only the reference bases within your intervals.
+ * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
+ * separate fasta sequence (named numerically in order).
*
*
Input
*
From 2f58fdb369a3cd4857281dd210427fac6352ca88 Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Thu, 15 Sep 2011 16:26:11 -0400
Subject: [PATCH 40/49] Adding expected output doc to CountCovariates
---
.../recalibration/CountCovariatesWalker.java | 36 +++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
index 98c8950e3..1bdb70bdd 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
@@ -76,6 +76,42 @@ import java.util.Map;
* Output
*
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
+ * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and the implied empirical quality score.
+ *
+ * The first 20 lines of such a file are shown below.
+ * * The file begins with a series of comment lines describing:
+ * ** The number of counted loci
+ * ** The number of counted bases
+ * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
+ *
+ * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
+ *
+ * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
+ * depending on which covariates were specified at runtime. The last three items are the data -- that is, number of observations for this combination of covariates, number of
+ * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+ *
+ *
+ * # Counted Sites 19451059
+ * # Counted Bases 56582018
+ * # Skipped Sites 82666
+ * # Fraction Skipped 1 / 235 bp
+ * ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
+ * SRR006446,11,65,CA,9,1,10
+ * SRR006446,11,48,TA,10,0,40
+ * SRR006446,11,67,AA,27,0,40
+ * SRR006446,11,61,GA,11,1,10
+ * SRR006446,12,34,CA,47,1,17
+ * SRR006446,12,30,GA,52,1,17
+ * SRR006446,12,36,AA,352,1,25
+ * SRR006446,12,17,TA,182,11,12
+ * SRR006446,11,48,TG,2,0,40
+ * SRR006446,11,67,AG,1,0,40
+ * SRR006446,12,34,CG,9,0,40
+ * SRR006446,12,30,GG,43,0,40
+ * ERR001876,4,31,AG,1,0,40
+ * ERR001876,4,31,AT,2,2,1
+ * ERR001876,4,31,CA,1,0,40
+ *
*
*
* Examples
From 9dc6354130b23683c31a7b2c1ef8c2ed94da1946 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:55:24 -0400
Subject: [PATCH 41/49] Oops didn't mean to touch this test before
---
.../gatk/walkers/varianteval/VariantEvalIntegrationTest.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index d8f7ad3b6..99622cbf6 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -42,7 +42,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-T VariantEval",
"-R " + b37KGReference,
"--dbsnp " + b37dbSNP132,
- "--eval " + variantEvalTestDataRoot + "CEU.trio.callsForVE.vcf",
+ "--eval " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf",
"-noEV",
"-EV TiTvVariantEvaluator",
"-ST Sample",
From d78e00e5b2cd5e8a1b1aa75209100b039e521442 Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Thu, 15 Sep 2011 16:09:07 -0400
Subject: [PATCH 42/49] Renaming VariantAnnotator SnpEff keys
This is to head off potential confusion with the output from the SnpEff tool itself,
which also uses a key named EFF.
---
.../sting/gatk/walkers/annotator/SnpEff.java | 90 ++++++++++---------
.../stratifications/FunctionalClass.java | 4 +-
.../VariantAnnotatorIntegrationTest.java | 2 +-
.../VariantEvalIntegrationTest.java | 2 +-
4 files changed, 53 insertions(+), 45 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
index bb3685fb5..4ead77506 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
@@ -68,23 +68,31 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
// Key names for the INFO field annotations we will add to each record, along
// with parsing-related information:
public enum InfoFieldKey {
- EFF (-1),
- EFF_IMPACT (0),
- EFF_CODON_CHANGE (1),
- EFF_AMINO_ACID_CHANGE (2),
- EFF_GENE_NAME (3),
- EFF_GENE_BIOTYPE (4),
- EFF_TRANSCRIPT_ID (6),
- EFF_EXON_ID (7);
+ EFFECT_KEY ("SNPEFF_EFFECT", -1),
+ IMPACT_KEY ("SNPEFF_IMPACT", 0),
+ CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
+ AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
+ GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
+ GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
+ TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
+ EXON_ID_KEY ("SNPEFF_EXON_ID", 7);
+
+ // Actual text of the key
+ private final String keyName;
// Index within the effect metadata subfields from the SnpEff EFF annotation
// where each key's associated value can be found during parsing.
private final int fieldIndex;
- InfoFieldKey ( int fieldIndex ) {
+ InfoFieldKey ( String keyName, int fieldIndex ) {
+ this.keyName = keyName;
this.fieldIndex = fieldIndex;
}
+ public String getKeyName() {
+ return keyName;
+ }
+
public int getFieldIndex() {
return fieldIndex;
}
@@ -292,27 +300,27 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
}
public List getKeyNames() {
- return Arrays.asList( InfoFieldKey.EFF.toString(),
- InfoFieldKey.EFF_IMPACT.toString(),
- InfoFieldKey.EFF_CODON_CHANGE.toString(),
- InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(),
- InfoFieldKey.EFF_GENE_NAME.toString(),
- InfoFieldKey.EFF_GENE_BIOTYPE.toString(),
- InfoFieldKey.EFF_TRANSCRIPT_ID.toString(),
- InfoFieldKey.EFF_EXON_ID.toString()
+ return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
+ InfoFieldKey.IMPACT_KEY.getKeyName(),
+ InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
+ InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
+ InfoFieldKey.GENE_NAME_KEY.getKeyName(),
+ InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
+ InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
+ InfoFieldKey.EXON_ID_KEY.getKeyName()
);
}
public List getDescriptions() {
return Arrays.asList(
- new VCFInfoHeaderLine(InfoFieldKey.EFF.toString(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_IMPACT.toString(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_CODON_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_NAME.toString(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_BIOTYPE.toString(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_EXON_ID.toString(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
+ new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
+ new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
+ new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
);
}
@@ -375,16 +383,16 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
}
try {
- impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]);
+ impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
}
catch ( IllegalArgumentException e ) {
- parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]));
+ parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
}
- codonChange = effectMetadata[InfoFieldKey.EFF_CODON_CHANGE.getFieldIndex()];
- aminoAcidChange = effectMetadata[InfoFieldKey.EFF_AMINO_ACID_CHANGE.getFieldIndex()];
- geneName = effectMetadata[InfoFieldKey.EFF_GENE_NAME.getFieldIndex()];
- geneBiotype = effectMetadata[InfoFieldKey.EFF_GENE_BIOTYPE.getFieldIndex()];
+ codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
+ aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
+ geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
+ geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
try {
@@ -398,8 +406,8 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
coding = EffectCoding.UNKNOWN;
}
- transcriptID = effectMetadata[InfoFieldKey.EFF_TRANSCRIPT_ID.getFieldIndex()];
- exonID = effectMetadata[InfoFieldKey.EFF_EXON_ID.getFieldIndex()];
+ transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()];
+ exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()];
}
private void parseError ( String message ) {
@@ -443,14 +451,14 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
public Map getAnnotations() {
Map annotations = new LinkedHashMap(Utils.optimumHashSize(InfoFieldKey.values().length));
- addAnnotation(annotations, InfoFieldKey.EFF.toString(), effect.toString());
- addAnnotation(annotations, InfoFieldKey.EFF_IMPACT.toString(), impact.toString());
- addAnnotation(annotations, InfoFieldKey.EFF_CODON_CHANGE.toString(), codonChange);
- addAnnotation(annotations, InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), aminoAcidChange);
- addAnnotation(annotations, InfoFieldKey.EFF_GENE_NAME.toString(), geneName);
- addAnnotation(annotations, InfoFieldKey.EFF_GENE_BIOTYPE.toString(), geneBiotype);
- addAnnotation(annotations, InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), transcriptID);
- addAnnotation(annotations, InfoFieldKey.EFF_EXON_ID.toString(), exonID);
+ addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
+ addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
+ addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
+ addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
+ addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
+ addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
+ addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
+ addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
return annotations;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
index a32857ffc..88ffcaaeb 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
@@ -62,8 +62,8 @@ public class FunctionalClass extends VariantStratifier {
annotationId++;
} while (eval.hasAttribute(key));
- } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFF.name() ) ) {
- SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFF.name()).toString());
+ } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName() ) ) {
+ SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName()).toString());
if ( snpEffType == SnpEff.EffectType.STOP_GAINED )
type = FunctionalType.nonsense;
else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING )
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index f902ce276..08baae7a7 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -134,7 +134,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation +
"snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000",
1,
- Arrays.asList("a1c3ba9efc28ee0606339604095076ea")
+ Arrays.asList("486fc6a5ca1819f5ab180d5d72b1ebc9")
);
executeTest("Testing SnpEff annotations", spec);
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index 99622cbf6..b90e6d0ff 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -32,7 +32,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
1,
Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12")
);
- executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec);
+ executeTest("testFunctionClassWithSnpeff", spec);
}
@Test
From e6e9b08c9a47640f9be32b47f495174118636a5c Mon Sep 17 00:00:00 2001
From: Menachem Fromer
Date: Thu, 15 Sep 2011 18:51:09 -0400
Subject: [PATCH 44/49] Must provide alleles VCF to UGCallVariants
---
.../sting/gatk/walkers/genotyper/UGCallVariants.java | 1 -
1 file changed, 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java
index 500b11360..d88e55687 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java
@@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;
From 9fdf1f8eb663858cacafd8fb339d098cdce4b96d Mon Sep 17 00:00:00 2001
From: Christopher Hartl
Date: Thu, 15 Sep 2011 21:05:22 -0400
Subject: [PATCH 45/49] Fix some doc formatting for Depth of Coverage
---
.../gatk/walkers/coverage/DepthOfCoverageWalker.java | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
index 3a18fe610..86f97a36c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
@@ -69,14 +69,23 @@ import java.util.*;
* Output
*
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
+ *
* - no suffix: per locus coverage
+ *
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
+ *
* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
+ *
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
+ *
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
+ *
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
+ *
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
+ *
* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
+ *
+ * - _cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases
*
*
From 939babc820cc5174a1d97a8b6bdb992ca6cedc09 Mon Sep 17 00:00:00 2001
From: Christopher Hartl
Date: Thu, 15 Sep 2011 21:05:51 -0400
Subject: [PATCH 46/49] Updating formatting for ValidationAmplicons GATK docs
---
.../sting/gatk/walkers/validation/ValidationAmplicons.java | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
index 01e8cd321..48cba6a1a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
@@ -61,7 +61,7 @@ import java.util.List;
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
*
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
- *
+ *
* Valid // amplicon is valid
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
@@ -72,10 +72,10 @@ import java.util.List;
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
* INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or following, or a deletion that spans the site itself)
- *
+ *
*
* * java * -jar GenomeAnalysisTK.jar * -T ValidationAmplicons From 33967a4e0c09e85cc4dc1d0eb83fe6feef80c46d Mon Sep 17 00:00:00 2001 From: Khalid ShakirDate: Fri, 16 Sep 2011 12:46:07 -0400 Subject: [PATCH 48/49] Fixed issue reported by chartl where cloned functions lost tags on @Inputs. Updated ExampleUnifiedGenotyper.scala with new syntax. --- .../examples/ExampleUnifiedGenotyper.scala | 6 +-- .../sting/queue/extensions/gatk/RodBind.scala | 2 +- .../queue/extensions/gatk/TaggedFile.scala | 2 +- .../sting/queue/function/QFunction.scala | 16 +------- .../{function => util}/FileExtension.scala | 2 +- .../sting/queue/util/IOUtils.scala | 40 ++++++++++++++----- 6 files changed, 36 insertions(+), 32 deletions(-) rename public/scala/src/org/broadinstitute/sting/queue/{function => util}/FileExtension.scala (89%) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala index 1d473b210..9bddfd97c 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala @@ -56,15 +56,15 @@ class ExampleUnifiedGenotyper extends QScript { genotyper.input_file :+= qscript.bamFile genotyper.out = swapExt(qscript.bamFile, "bam", "unfiltered.vcf") - evalUnfiltered.rodBind :+= RodBind("eval", "VCF", genotyper.out) + evalUnfiltered.eval :+= genotyper.out evalUnfiltered.out = swapExt(genotyper.out, "vcf", "eval") - variantFilter.rodBind :+= RodBind("variant", "VCF", genotyper.out) + variantFilter.variant = genotyper.out variantFilter.out = swapExt(qscript.bamFile, "bam", "filtered.vcf") variantFilter.filterName = filterNames variantFilter.filterExpression = filterExpressions.map("\"" + _ + "\"") - evalFiltered.rodBind :+= RodBind("eval", "VCF", variantFilter.out) + evalFiltered.eval 
:+= variantFilter.out evalFiltered.out = swapExt(variantFilter.out, "vcf", "eval") add(genotyper, evalUnfiltered) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala index 42f63e225..b4c5d91d3 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala @@ -1,7 +1,7 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File -import org.broadinstitute.sting.queue.function.FileExtension +import org.broadinstitute.sting.queue.util.FileExtension import java.lang.String /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala index ed8158b49..b19f9e430 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala @@ -1,7 +1,7 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File -import org.broadinstitute.sting.queue.function.FileExtension +import org.broadinstitute.sting.queue.util.FileExtension /** * Used to provide tagged -I input_file arguments to the GATK. 
diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index c905581fa..500f7b200 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -387,25 +387,11 @@ trait QFunction extends Logging with QJobReport { */ protected def canon(value: Any) = { value match { - case fileExtension: FileExtension => - val newFile = absolute(fileExtension); - val newFileExtension = fileExtension.withPath(newFile.getPath) - newFileExtension - case file: File => - if (file.getClass != classOf[File]) - throw new QException("Extensions of file must also extend with FileExtension so that the path can be modified."); - absolute(file) + case file: File => IOUtils.absolute(commandDirectory, file) case x => x } } - /** - * Returns the absolute path to the file relative to the run directory and the job command directory. - * @param file File to root relative to the command directory if it is not already absolute. - * @return The absolute path to file. - */ - private def absolute(file: File) = IOUtils.absolute(commandDirectory, file) - /** * Scala sugar type for checking annotation required and exclusiveOf. 
*/ diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala b/public/scala/src/org/broadinstitute/sting/queue/util/FileExtension.scala similarity index 89% rename from public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/FileExtension.scala index e2394a5bf..9b6e52c8e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/FileExtension.scala @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.queue.function +package org.broadinstitute.sting.queue.util import java.io.File diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala index 79ffa8cb9..b17ccc0d5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala @@ -3,6 +3,7 @@ package org.broadinstitute.sting.queue.util import org.apache.commons.io.FileUtils import java.io.{FileReader, File} import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.queue.QException /** * A collection of utilities for modifying java.io. @@ -12,7 +13,7 @@ object IOUtils extends Logging { * Checks if the temp directory has been setup and throws an exception if they user hasn't set it correctly. * @param tempDir Temporary directory. */ - def checkTempDir(tempDir: File) = { + def checkTempDir(tempDir: File) { val tempDirPath = tempDir.getAbsolutePath // Keeps the user from leaving the temp directory as the default, and on Macs from having pluses // in the path which can cause problems with the Google Reflections library. 
@@ -20,7 +21,7 @@ object IOUtils extends Logging { if (tempDirPath.startsWith("/var/folders/") || (tempDirPath == "/tmp") || (tempDirPath == "/tmp/")) throw new UserException.BadTmpDir("java.io.tmpdir must be explicitly set") if (!tempDir.exists && !tempDir.mkdirs) - throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath()) + throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath) } /** @@ -35,9 +36,9 @@ object IOUtils extends Logging { throw new UserException.BadTmpDir("Could not create temp directory: " + tempDirParent) val temp = File.createTempFile(prefix + "-", suffix, tempDirParent) if (!temp.delete) - throw new UserException.BadTmpDir("Could not delete sub file: " + temp.getAbsolutePath()) + throw new UserException.BadTmpDir("Could not delete sub file: " + temp.getAbsolutePath) if (!temp.mkdir) - throw new UserException.BadTmpDir("Could not create sub directory: " + temp.getAbsolutePath()) + throw new UserException.BadTmpDir("Could not create sub directory: " + temp.getAbsolutePath) absolute(temp) } @@ -46,7 +47,7 @@ object IOUtils extends Logging { * @param file File to write to. * @param content Content to write. */ - def writeContents(file: File, content: String) = FileUtils.writeStringToFile(file, content) + def writeContents(file: File, content: String) { FileUtils.writeStringToFile(file, content) } /** * Reads content of a file into a string. @@ -146,10 +147,12 @@ object IOUtils extends Logging { * @return The absolute path to the file in the parent dir if the path was not absolute, otherwise the original path. 
*/ def absolute(parent: File, file: File): File = { - if (file.isAbsolute) - absolute(file) - else - absolute(new File(parent, file.getPath)) + val newPath = + if (file.isAbsolute) + absolutePath(file) + else + absolutePath(new File(parent, file.getPath)) + replacePath(file, newPath) } /** @@ -159,12 +162,16 @@ object IOUtils extends Logging { * @return the absolute path to the file. */ def absolute(file: File) = { + replacePath(file, absolutePath(file)) + } + + private def absolutePath(file: File) = { var fileAbs = file.getAbsoluteFile var names = List.empty[String] while (fileAbs != null) { val name = fileAbs.getName fileAbs = fileAbs.getParentFile - + if (name == ".") { /* skip */ @@ -190,7 +197,18 @@ object IOUtils extends Logging { } } - new File(names.mkString("/", "/", "")) + names.mkString("/", "/", "") + } + + private def replacePath(file: File, path: String) = { + file match { + case fileExtension: FileExtension => + fileExtension.withPath(path) + case file: File => + if (file.getClass != classOf[File]) + throw new QException("Sub classes of java.io.File must also implement FileExtension so that the path can be modified.") + new File(path) + } } /** From 7fa1e237d9e1e41fbdcd42f069df7c658f523bc7 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 16 Sep 2011 12:53:54 -0400 Subject: [PATCH 49/49] Forgot to git stash pop new MD5's for CombineVariants integration test --- .../walkers/variantutils/CombineVariantsIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 3267173a7..35495d797 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -90,7 +90,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f"); } // official project VCF files in tabix format @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9"); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f1cf095c2fe9641b7ca1f8ee2c46fd4a"); } + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "312a22aedb088b678bc891f1a1b03c91"); } @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083"); } @@ -110,7 +110,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("1de95f91ca15d2a8856de35dee0ce33e")); + Arrays.asList("35acb0f15f9cd18c653ede4e15e365c9")); executeTest("threeWayWithRefs", spec); }