diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java index a0382df91..de2d7ae30 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java @@ -120,7 +120,7 @@ public class BQSRIntegrationTest extends WalkerTest { {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0a92fdff5fd26227c29d34eda5a32f49")}, - {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "90d8c24077e8ae9a0037a9aad5f09e31")}, + {new BQSRTest(hg18Reference, privateTestDir + "originalQuals.1kg.chr1.1-1K.1RG.bam", "chr1:1-1,000", " -OQ", "90d8c24077e8ae9a0037a9aad5f09e31")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "c41ef02c640ef1fed4bfc03b9b33b616")}, {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "b577cd1d529425f66db49620db09fdca")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "0b5a8e259e997e4c7b5836d4c28e6f4d")}, diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java index f815d8542..7d2f2d0c6 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -189,7 +189,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { @Test public void testComplexEvents() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString("complexEvents.vcf", "ALL"), + baseTestString("complexEvents.vcf", "ALL", DEFAULT_REGION, b37KGReference), 0, Arrays.asList(EMPTY_MD5) ); @@ -197,6 +197,15 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { executeTest("test validating complex events", spec); } + @Test(description = "Checks out of order header contigs") + public void testOutOfOrderHeaderContigsError() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("complexEvents-outOfOrder.vcf", "ALL", DEFAULT_REGION, b37KGReference), + 0, UserException.LexicographicallySortedSequenceDictionary.class); + executeTest("test out of order header contigs error", spec); + } + @Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164") public void testUnusedAlleleFix() { WalkerTestSpec spec = new WalkerTestSpec( @@ -255,4 +264,31 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { // Set the log level back logger.setLevel(level); } + + @Test(description = "Checks '''issue''' reported in issue https://github.com/broadinstitute/gsa-unstable/issues/964") + public void testWrongContigHeaderLengthError() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference), + 0, UserException.IncompatibleSequenceDictionaries.class); + executeTest("test wrong header contig length error", spec); + } + + @Test + public void testAllowWrongContigHeaderLengthDictIncompat() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ALLOW_SEQ_DICT_INCOMPATIBILITY ", + 0, Arrays.asList(EMPTY_MD5)); + executeTest("test to allow wrong header contig length, not checking dictionary incompatibility", spec); + } + + @Test + public void testAllowWrongContigHeaderLength() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ", + 0, Arrays.asList(EMPTY_MD5)); + executeTest("test to allow wrong header contig length, no compatibility checks", spec); + } } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java index 86f020394..be2bf610a 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java @@ -886,8 +886,8 @@ public class GenomeAnalysisEngine { // Compile a set of sequence names that exist in the BAM files. SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - if (readsDictionary.size() == 0) { - logger.info("Reads file is unmapped. Skipping validation against reference."); + if (readsDictionary.isEmpty()) { + logger.info("Reads file is unmapped. Skipping validation against reference."); return; } diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java index f1e832d2c..b8f4fffc6 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java @@ -39,18 +39,18 @@ public class CramIntegrationTest extends WalkerTest { @DataProvider(name="cramData") public Object[][] getCRAMData() { return new Object[][] { - {"PrintReads", "exampleBAM.bam", "", "cram", "95cce9111a377d8e84fd076f7f0744df"}, - {"PrintReads", "exampleCRAM.cram", "", "cram", "95cce9111a377d8e84fd076f7f0744df"}, - {"PrintReads", "exampleCRAM.cram", "", "bam", "7c716d5103c20e95a8115b3c2848bf0d"}, - {"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "5cc9cac96f7f9819688d47a28ed97778"}, + {"PrintReads", "exampleBAM.bam", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"}, + {"PrintReads", "exampleCRAM.cram", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"}, + {"PrintReads", "exampleCRAM.cram", "", "bam", "247805098718dd74b8a871796424d359"}, + {"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "a5b26631cd89f86f6184bcac7bc9c9ca"}, {"CountLoci", "exampleCRAM.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"}, {"CountLoci", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, {"CountReads", "exampleCRAM.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"}, {"CountReads", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, - {"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"}, + {"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "24dbd14b60220461f47ec5517962cb7f"}, {"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"}, {"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"}, - {"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"}, + {"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "84bee5063d8fa0d07e7c3ff7e825ae3a"}, {"CountLoci", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"}, {"CountReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"}, }; diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java index 9e3184f12..2d2ce2bd3 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java @@ -38,7 +38,7 @@ public class GATKKeyIntegrationTest extends WalkerTest { public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s", publicTestDir + "exampleFASTA.fasta", publicTestDir + "exampleBAM.bam"); - public static final String MD5_UPON_SUCCESSFUL_RUN = "69b6432aed2d0cebf02e5d414f654425"; + public static final String MD5_UPON_SUCCESSFUL_RUN = "462656ec9632f8c21ee534d35093c3f8"; private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) { diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java index 870277d1d..9f21233aa 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java @@ -35,7 +35,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest { @Test public void testMissingReadGroup() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", + "-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", 0, UserException.ReadMissingReadGroup.class); executeTest("test Missing Read Group", spec); @@ -44,7 +44,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest { @Test public void testUndefinedReadGroup() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", + "-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", 0, UserException.ReadHasUndefinedReadGroup.class); executeTest("test Undefined Read Group", spec); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java index 160abd38d..71d87409f 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.utils; +import java.math.BigInteger; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; @@ -79,7 +80,7 @@ public class SequenceDictionaryUtils { IDENTICAL, // the dictionaries are identical COMMON_SUBSET, // there exists a common subset of equivalent contigs NO_COMMON_CONTIGS, // no overlap between dictionaries - UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths + UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths and/or MD5s NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine) OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different // orders with respect to each other @@ -92,7 +93,7 @@ public class SequenceDictionaryUtils { * @param validationExclusion exclusions to validation * @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency */ - private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) { + private static boolean allowNonFatalIncompabilities(final ValidationExclusion.TYPE validationExclusion) { return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY || validationExclusion == ValidationExclusion.TYPE.ALL ); } @@ -133,15 +134,23 @@ public class SequenceDictionaryUtils { throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); case UNEQUAL_COMMON_CONTIGS: { - List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); - SAMSequenceRecord elt1 = x.get(0); - SAMSequenceRecord elt2 = x.get(1); + final List x = findNotEqualCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); + final SAMSequenceRecord elt1 = x.get(0); + final SAMSequenceRecord elt2 = x.get(1); + + String msg = "Found contigs with the same name but different lengths"; + String contig1 = " contig " + name1 + " is named " + elt1.getSequenceName() + " with length " + Integer.toString(elt1.getSequenceLength()); + if ( elt1.getMd5() != null ) + contig1 += " and MD5 " + elt1.getMd5(); + String contig2 = " contig " + name2 + " is named " + elt2.getSequenceName() + " with length " + Integer.toString(elt2.getSequenceLength()); + if ( elt2.getMd5() != null ) + contig2 += " and MD5 " + elt2.getMd5(); + if ( elt1.getMd5() != null || elt2.getMd5() != null ) + msg += " or MD5s:"; + msg += "\n" + contig1 + "\n" + contig2; // todo -- replace with toString when SAMSequenceRecord has a nice toString routine - UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", - name1, elt1.getSequenceName(), elt1.getSequenceLength(), - name2, elt2.getSequenceName(), elt2.getSequenceLength()), - name1, dict1, name2, dict2); + final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); if ( allowNonFatalIncompabilities(validationExclusion) ) logger.warn(ex.getMessage()); @@ -226,7 +235,7 @@ public class SequenceDictionaryUtils { final Set commonContigs = getCommonContigsByName(dict1, dict2); - if (commonContigs.size() == 0) + if (commonContigs.isEmpty()) return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; @@ -250,12 +259,12 @@ public class SequenceDictionaryUtils { * @param dict2 * @return true if all of the common contigs are equivalent */ - private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; + private static boolean commonContigsHaveSameLengths(final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) { + return findNotEqualCommonContigs(commonContigs, dict1, dict2) == null; } /** - * Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns + * Returns a List(x,y) that contains two sequence records that are not equal among the common contigs in both dicts. Returns * null if all common contigs are equivalent * * @param commonContigs @@ -263,7 +272,7 @@ public class SequenceDictionaryUtils { * @param dict2 * @return */ - private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + private static List findNotEqualCommonContigs(final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) { for ( String name : commonContigs ) { SAMSequenceRecord elt1 = dict1.getSequence(name); SAMSequenceRecord elt2 = dict2.getSequence(name); @@ -275,32 +284,32 @@ public class SequenceDictionaryUtils { } /** - * Helper routine that returns two sequence records are equivalent, defined as having the same name and - * lengths, if both are non-zero + * Helper routine that determines if two sequence records are equivalent, defined as having the same name, + * lengths (if both are non-zero) and MD5 (if present) * - * @param me - * @param that - * @return + * @param record1 a SAMSequenceRecord + * @param record2 a SAMSequenceRecord + * @return true if the records are equivalent, false otherwise */ - private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) { - if (me == that) return true; - if (that == null) return false; + private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord record1, final SAMSequenceRecord record2) { + if ( record1 == record2 ) return true; + if ( record1 == null || record2 == null ) return false; - if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength()) + // compare length + if ( record1.getSequenceLength() != 0 && record2.getSequenceLength() != 0 && record1.getSequenceLength() != record2.getSequenceLength() ) return false; - // todo -- reenable if we want to be really strict here -// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) { -// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); -// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); -// if (!thisMd5.equals(thatMd5)) { -// return false; -// } -// } -// else { - if (me.getSequenceName() != that.getSequenceName()) - return false; // Compare using == since we intern() the Strings -// } + // compare name + if ( !record1.getSequenceName().equals(record2.getSequenceName() )) + return false; + + // compare MD5 + if ( record1.getMd5() != null && record2.getMd5() != null ){ + final BigInteger firstMd5 = new BigInteger(record1.getMd5(), 16); + final BigInteger secondMd5 = new BigInteger(record2.getMd5(), 16); + if ( !firstMd5.equals(secondMd5) ) + return false; + } return true; } @@ -313,13 +322,13 @@ public class SequenceDictionaryUtils { * @param dict * @return */ - private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { + private static boolean nonCanonicalHumanContigOrder(final SAMSequenceDictionary dict) { if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false return false; SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; - for ( SAMSequenceRecord elt : dict.getSequences() ) { + for ( final SAMSequenceRecord elt : dict.getSequences() ) { if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt; if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt; if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt; @@ -355,7 +364,7 @@ public class SequenceDictionaryUtils { * @param dict2 second SAMSequenceDictionary * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false */ - private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + private static boolean commonContigsAreInSameRelativeOrder(final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) { List list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1)); List list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2)); @@ -376,8 +385,8 @@ public class SequenceDictionaryUtils { * @param dict * @return */ - private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) { - List l = new ArrayList(commonContigs.size()); + private static List getSequencesOfName(final Set commonContigs, final SAMSequenceDictionary dict) { + final List l = new ArrayList(commonContigs.size()); for ( String name : commonContigs ) { l.add(dict.getSequence(name) ); } @@ -401,7 +410,7 @@ public class SequenceDictionaryUtils { * @param unsorted * @return */ - private static List sortSequenceListByIndex(List unsorted) { + private static List sortSequenceListByIndex(final List unsorted) { Collections.sort(unsorted, new CompareSequenceRecordsByIndex()); return unsorted; } @@ -418,8 +427,8 @@ public class SequenceDictionaryUtils { */ private static boolean commonContigsAreAtSameIndices( final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) { for ( String commonContig : commonContigs ) { - SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); - SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); + final SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); + final SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); // Each common contig must have the same index in both dictionaries if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { @@ -489,13 +498,13 @@ public class SequenceDictionaryUtils { * @return */ public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - Set intersectingSequenceNames = getContigNames(dict1); + final Set intersectingSequenceNames = getContigNames(dict1); intersectingSequenceNames.retainAll(getContigNames(dict2)); return intersectingSequenceNames; } public static Set getContigNames(SAMSequenceDictionary dict) { - Set contigNames = new HashSet(Utils.optimumHashSize(dict.size())); + final Set contigNames = new HashSet(Utils.optimumHashSize(dict.size())); for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) contigNames.add(dictionaryEntry.getSequenceName()); return contigNames; @@ -515,7 +524,7 @@ public class SequenceDictionaryUtils { throw new IllegalArgumentException("Sequence dictionary must be non-null"); } - StringBuilder s = new StringBuilder("[ "); + final StringBuilder s = new StringBuilder("[ "); for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { s.append(dictionaryEntry.getSequenceName()); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java index 08d84b084..c09ebe4cc 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java @@ -40,7 +40,7 @@ public class ValidationExclusion { ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file - ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities + ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompatibilities LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc. @EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL ALL // do not check for all of the above conditions, DEFAULT diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java index 42cf87433..30bd8ec6f 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java @@ -102,7 +102,7 @@ public class IndexDictionaryUtils { final SAMSequenceDictionary referenceDict, final ValidationExclusion.TYPE validationExclusionType ) { // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) + if (trackDict == null || trackDict.isEmpty()) logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); else { Set trackSequences = new TreeSet(); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java index 7863b022d..76f2046af 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java @@ -93,7 +93,7 @@ public class RMDTrack { * @param dict the sam sequence dictionary * @param codec the feature codec we use to decode this type */ - public RMDTrack(Class type, String name, File file, AbstractFeatureReader reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) { + public RMDTrack(final Class type, final String name, final File file, final AbstractFeatureReader reader, final SAMSequenceDictionary dict, final GenomeLocParser genomeLocParser, final FeatureCodec codec) { this.type = type; this.name = name; this.file = file; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java index 34be252a7..0b84c05a2 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java @@ -26,6 +26,9 @@ package org.broadinstitute.gatk.utils.refdata.tracks; import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.variant.vcf.VCFContigHeaderLine; +import htsjdk.variant.vcf.VCFHeader; import org.apache.log4j.Logger; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.FeatureCodec; @@ -34,6 +37,7 @@ import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.Index; import htsjdk.tribble.index.IndexFactory; import htsjdk.tribble.util.LittleEndianOutputStream; +import org.broadinstitute.gatk.utils.SequenceDictionaryUtils; import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.ValidationExclusion; @@ -49,6 +53,8 @@ import org.broadinstitute.gatk.utils.instrumentation.Sizeof; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.util.Map; @@ -131,7 +137,7 @@ public class RMDTrackBuilder { // extends PluginManager { * * @return an instance of the track */ - public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { + public RMDTrack createInstanceOfTrack(final RMDTriplet fileDescriptor) { String name = fileDescriptor.getName(); File inputFile = new File(fileDescriptor.getFile()); @@ -146,9 +152,43 @@ public class RMDTrackBuilder { // extends PluginManager { else pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); + + validateVariantAgainstSequenceDictionary(name, descriptor.getName(), pair.first, pair.second); + return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile)); } + /** + * Validate the VCF dictionary against the sequence dictionary. + * + * @param name the name of this specific track + * @param descriptorName the name of the feature + * @param reader the feature reader to use as the underlying data source + * @param dict the sam sequence dictionary + */ + private void validateVariantAgainstSequenceDictionary(final String name, final String descriptorName, final AbstractFeatureReader reader, final SAMSequenceDictionary dict ) throws UserException { + // only process if the variant is a VCF + if ( name.equals("variant") && descriptorName.equals("VCF") ){ + if ( reader != null && dict != null && reader.getHeader() != null ){ + final List contigs = ((VCFHeader) reader.getHeader()).getContigLines(); + if (contigs != null) { + // make the VCF dictionary from the contig header fields + final List vcfContigRecords = new ArrayList(); + for (final VCFContigHeaderLine contig : contigs) + vcfContigRecords.add(contig.getSAMSequenceRecord()); + + // have VCF contig fields so can make a dictionary and compare it to the sequence dictionary + if (!vcfContigRecords.isEmpty()) { + final SAMSequenceDictionary vcfDictionary = new SAMSequenceDictionary(vcfContigRecords); + final SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary(dict.getSequences()); + + SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, name, vcfDictionary, "sequence", sequenceDictionary, false, null); + } + } + } + } + } + /** * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. * @param codecClass Type of Tribble codec class to build. @@ -228,7 +268,7 @@ public class RMDTrackBuilder { // extends PluginManager { sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match - if (sequenceDictionary.size() == 0 && dict != null) { + if (sequenceDictionary.isEmpty() && dict != null) { validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); if ( ! disableAutoIndexCreation ) { diff --git a/public/gatk-utils/src/test/resources/exampleBAM.bam b/public/gatk-utils/src/test/resources/exampleBAM.bam index 319dd1a72..d1c9d8001 100644 Binary files a/public/gatk-utils/src/test/resources/exampleBAM.bam and b/public/gatk-utils/src/test/resources/exampleBAM.bam differ diff --git a/public/gatk-utils/src/test/resources/exampleBAM.bam.bai b/public/gatk-utils/src/test/resources/exampleBAM.bam.bai index 052ac614b..33beef3d1 100644 Binary files a/public/gatk-utils/src/test/resources/exampleBAM.bam.bai and b/public/gatk-utils/src/test/resources/exampleBAM.bam.bai differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram index 334090031..d4db8b3dc 100644 Binary files a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram and b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram index 334090031..d4db8b3dc 100644 Binary files a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram and b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram.crai b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram.crai index c5492846c..c3d728b4c 100644 Binary files a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram.crai and b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram.crai differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM.cram b/public/gatk-utils/src/test/resources/exampleCRAM.cram index 334090031..78d606d03 100644 Binary files a/public/gatk-utils/src/test/resources/exampleCRAM.cram and b/public/gatk-utils/src/test/resources/exampleCRAM.cram differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM.cram.crai b/public/gatk-utils/src/test/resources/exampleCRAM.cram.crai new file mode 100644 index 000000000..3eee8e007 Binary files /dev/null and b/public/gatk-utils/src/test/resources/exampleCRAM.cram.crai differ