diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java index 64f87fe5f..f815d8542 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -51,20 +51,25 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Level; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.Assert; import org.testng.annotations.Test; +import java.io.File; +import java.io.IOException; import java.util.Arrays; public class ValidateVariantsIntegrationTest extends WalkerTest { - protected static final String emptyMd5 = "d41d8cd98f00b204e9800998ecf8427e"; - protected static final String defaultRegion = "1:10001292-10001303"; + protected static final String EMPTY_MD5 = "d41d8cd98f00b204e9800998ecf8427e"; + protected static final String DEFAULT_REGION = "1:10001292-10001303"; public static String baseTestString(final String file, String type) { - return baseTestString(file,type,defaultRegion,b36KGReference); + return baseTestString(file,type,DEFAULT_REGION,b36KGReference); } public static String baseTestString(String file, String type, String region, String reference) { @@ -88,7 +93,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleGood.vcf", "ALL"), 0, - Arrays.asList(emptyMd5) + Arrays.asList(EMPTY_MD5) ); executeTest("test good file", spec); @@ -175,7 +180,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad.vcf", "-ALL"), 0, - Arrays.asList(emptyMd5) + Arrays.asList(EMPTY_MD5) ); executeTest("test no validation", spec); @@ -186,7 +191,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("complexEvents.vcf", "ALL"), 0, - Arrays.asList(emptyMd5) + Arrays.asList(EMPTY_MD5) ); executeTest("test validating complex events", spec); @@ -195,7 +200,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { @Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164") public void testUnusedAlleleFix() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString("validationUnusedAllelesBugFix.vcf","-ALLELES","1:1-739000",b37KGReference),0,Arrays.asList(emptyMd5)); + baseTestString("validationUnusedAllelesBugFix.vcf","-ALLELES","1:1-739000",b37KGReference),0,Arrays.asList(EMPTY_MD5)); executeTest("test unused allele bug fix", spec); } @@ -205,4 +210,49 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { baseTestString("validationUnusedAllelesBugFix.vcf","ALLELES","1:1-739000",b37KGReference),0, UserException.FailsStrictValidation.class); executeTest("test unused allele bug fix", spec); } + + @Test(description = "Checks '''bug''' reported in issue https://github.com/broadinstitute/gsa-unstable/issues/963") + public void testLargeReferenceAlleleError() throws IOException { + // Need to see log INFO messages + Level level = logger.getLevel(); + logger.setLevel(Level.INFO); + + File logFile = createTempFile("testLargeReferenceAlleleError.log", ".tmp"); + String logFileName = logFile.getAbsolutePath(); + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("longAlleles.vcf", "ALL", "1", b37KGReference) + " -log " + logFileName, + 0, Arrays.asList(EMPTY_MD5)); + executeTest("test long reference allele bug error", spec); + + // Make sure the "reference allele too long" message is in the log + Assert.assertTrue(FileUtils.readFileToString(logFile).contains(ValidateVariants.REFERENCE_ALLELE_TOO_LONG_MSG)); + + // Set the log level back + logger.setLevel(level); + } + + @Test(description = "Checks '''bug''' is fixed, reported in issue https://github.com/broadinstitute/gsa-unstable/issues/963") + public void testLargeReferenceAlleleFix() throws IOException { + // Need to see log INFO messages + Level level = logger.getLevel(); + logger.setLevel(Level.INFO); + + File logFile = createTempFile("testLargeReferenceAllele.log", ".tmp"); + String logFileName = logFile.getAbsolutePath(); + + // expand window for the large reference allele + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("longAlleles.vcf","ALL","1",b37KGReference) + " --reference_window_stop 208 -log " + logFileName, + 0, Arrays.asList(EMPTY_MD5)); + executeTest("test long reference allele bug fix", spec); + + // Make sure the "reference allele too long" message is not in the log + Assert.assertFalse(FileUtils.readFileToString(logFile).contains(ValidateVariants.REFERENCE_ALLELE_TOO_LONG_MSG)); + + // All of the validation tests have passed since UserException.FailsStrictValidation is not thrown. + + // Set the log level back + logger.setLevel(level); + } } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java index ddc53c0db..20b55f3ff 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java @@ -50,6 +50,9 @@ import java.util.concurrent.TimeUnit; */ public class GATKArgumentCollection { + // the default value of the stop of the expanded window + public static final int DEFAULT_REFERENCE_WINDOW_STOP = 0; + /** the constructor */ public GATKArgumentCollection() { } @@ -663,5 +666,17 @@ public class GATKArgumentCollection { @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="Parameter to pass to the VCF/BCF IndexCreator",required=false) @Advanced public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; + + // -------------------------------------------------------------------------------------------------------------- + // + // Window arguments + // + // ------------------------------------------------------------------------------------------------------------- + /** + * Stop of the expanded window for which the reference context should be provided, relative to the locus. + */ + @Argument(fullName = "reference_window_stop", shortName = "ref_win_stop", doc = "Reference window stop", minValue = 0, required = false) + @Advanced + public int reference_window_stop = DEFAULT_REFERENCE_WINDOW_STOP; } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java index a39ceeae1..ede909e92 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; import htsjdk.samtools.reference.ReferenceSequence; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.Walker; @@ -74,7 +75,7 @@ public class LocusReferenceView extends ReferenceView { /** - * Start of the expanded window for which the reference context should be provided, + * Stop of the expanded window for which the reference context should be provided, * relative to the locus in question. */ private final int windowStop; @@ -99,10 +100,11 @@ public class LocusReferenceView extends ReferenceView { /** * Create a new locus reference view. + * @param walker input walker * @param provider source for locus data. */ public LocusReferenceView( Walker walker, LocusShardDataProvider provider ) { - super( provider ); + super(provider); initializeBounds(provider); // Retrieve information about the window being accessed. @@ -113,11 +115,22 @@ public class LocusReferenceView extends ReferenceView { if( window.stop() < 0 ) throw new ReviewedGATKException( "Reference window ends before current locus" ); windowStart = window.start(); - windowStop = window.stop(); + + if ( walker.getArguments() == null ){ + windowStop = window.stop(); + } else { + // Use reference arguments if set, otherwise use the annotation + windowStop = walker.getArguments().reference_window_stop != GATKArgumentCollection.DEFAULT_REFERENCE_WINDOW_STOP ? + walker.getArguments().reference_window_stop : window.stop(); + } } else { windowStart = 0; - windowStop = 0; + if ( walker.getArguments() == null ){ + windowStop = 0; + } else { + windowStop = walker.getArguments().reference_window_stop; + } } if(bounds != null) { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java index c21289722..9d08422eb 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java @@ -40,6 +40,7 @@ import org.broadinstitute.gatk.utils.baq.BAQ; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.engine.recalibration.BQSRMode; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; import java.util.List; @@ -85,18 +86,49 @@ public abstract class Walker { /** * Gets the master sequence dictionary for this walker * @link GenomeAnalysisEngine.getMasterSequenceDictionary - * @return + * @return the master sequence dictionary or null if no genome analysis toolkit. */ protected SAMSequenceDictionary getMasterSequenceDictionary() { - return getToolkit().getMasterSequenceDictionary(); + if ( toolkit == null ) + return null; + else + return toolkit.getMasterSequenceDictionary(); } + /** + * Gets the GATK argument collection + * @link GenomeAnalysisEngine.getArguments + * @return the GATK argument collection or null if no genome analysis toolkit. + */ + public GATKArgumentCollection getArguments(){ + if ( toolkit == null ) + return null; + else + return toolkit.getArguments(); + } + + /** + * Gets the GATK samples database + * @link GenomeAnalysisEngine.getSampleDB + * @return the GATK samples database or null if no genome analysis toolkit. + */ public SampleDB getSampleDB() { - return getToolkit().getSampleDB(); + if ( toolkit == null ) + return null; + else + return toolkit.getSampleDB(); } + /** + * Gets a sample from the GATK samples database + * @param id the sample ID + * @return the sample from the GATK samples database or null if no genome analysis toolkit or samples database. + */ protected Sample getSample(final String id) { - return getToolkit().getSampleDB().getSample(id); + if ( getSampleDB() == null ) + return null; + else + return getSampleDB().getSample(id); } /** diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java index 751b1cd1f..1bfbf753f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java @@ -91,6 +91,16 @@ import java.util.*; * --dbsnp dbsnp.vcf * * + *

To perform VCF format tests and all strict validations with the VCFs containing alleles > 208 bases

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T ValidateVariants \
+ *   -R reference.fasta \
+ *   -V input.vcf \
+ *   --dbsnp dbsnp.vcf
+ *   --reference_window_stop 208
+ * 
+ * *

To perform only VCF format tests

*
  * java -jar GenomeAnalysisTK.jar \
@@ -114,6 +124,9 @@ import java.util.*;
 @Reference(window=@Window(start=0,stop=100))
 public class ValidateVariants extends RodWalker {
 
+    // Log message for a reference allele that is too long
+    protected static final String REFERENCE_ALLELE_TOO_LONG_MSG = "Reference allele is too long";
+
     @ArgumentCollection
     protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
 
@@ -181,6 +194,9 @@ public class ValidateVariants extends RodWalker {
 
     private File file = null;
 
+    // Stop of the expanded window for which the reference context should be provided, relative to the locus.
+    private int referenceWindowStop;
+
     /**
      * Contains final set of validation to apply.
      */
@@ -189,6 +205,7 @@ public class ValidateVariants extends RodWalker {
     public void initialize() {
         file = new File(variantCollection.variants.getSource());
         validationTypes = calculateValidationTypesToApply(excludeTypes);
+        referenceWindowStop = getToolkit().getArguments().reference_window_stop;
     }
 
     public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@@ -220,8 +237,11 @@ public class ValidateVariants extends RodWalker {
         // get the true reference allele
         final Allele reportedRefAllele = vc.getReference();
         final int refLength = reportedRefAllele.length();
-        if ( refLength > 100 ) {
-            logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart()));
+
+        // reference length is greater than the reference window stop before and after expansion
+        if ( refLength > 100 && refLength > referenceWindowStop ) {
+            logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --referenceWindowStop >= %d",
+                    REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getChr(), vc.getStart(), refLength));
             return;
         }
 
@@ -259,7 +279,7 @@ public class ValidateVariants extends RodWalker {
      * @return never {@code null} but perhaps an empty set.
      */
     private Collection calculateValidationTypesToApply(final List excludeTypes) {
-        if (excludeTypes.size() == 0)
+        if (excludeTypes.isEmpty())
             return Collections.singleton(ValidationType.ALL);
         final Set excludeTypeSet = new LinkedHashSet<>(excludeTypes);
         if (excludeTypes.size() != excludeTypeSet.size())