Merge pull request #1223 from broadinstitute/rhl_dict_check

Validate VCF with sequence dictionary
This commit is contained in:
Ron Levine 2015-11-20 12:43:44 -05:00
commit dc0e32e3aa
18 changed files with 149 additions and 64 deletions

View File

@ -120,7 +120,7 @@ public class BQSRIntegrationTest extends WalkerTest {
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0a92fdff5fd26227c29d34eda5a32f49")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0a92fdff5fd26227c29d34eda5a32f49")},
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "90d8c24077e8ae9a0037a9aad5f09e31")}, {new BQSRTest(hg18Reference, privateTestDir + "originalQuals.1kg.chr1.1-1K.1RG.bam", "chr1:1-1,000", " -OQ", "90d8c24077e8ae9a0037a9aad5f09e31")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "c41ef02c640ef1fed4bfc03b9b33b616")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "c41ef02c640ef1fed4bfc03b9b33b616")},
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "b577cd1d529425f66db49620db09fdca")}, {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "b577cd1d529425f66db49620db09fdca")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "0b5a8e259e997e4c7b5836d4c28e6f4d")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "0b5a8e259e997e4c7b5836d4c28e6f4d")},

View File

@ -189,7 +189,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
@Test @Test
public void testComplexEvents() { public void testComplexEvents() {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("complexEvents.vcf", "ALL"), baseTestString("complexEvents.vcf", "ALL", DEFAULT_REGION, b37KGReference),
0, 0,
Arrays.asList(EMPTY_MD5) Arrays.asList(EMPTY_MD5)
); );
@ -197,6 +197,15 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
executeTest("test validating complex events", spec); executeTest("test validating complex events", spec);
} }
@Test(description = "Checks out of order header contigs")
public void testOutOfOrderHeaderContigsError() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("complexEvents-outOfOrder.vcf", "ALL", DEFAULT_REGION, b37KGReference),
0, UserException.LexicographicallySortedSequenceDictionary.class);
executeTest("test out of order header contigs error", spec);
}
@Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164") @Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
public void testUnusedAlleleFix() { public void testUnusedAlleleFix() {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
@ -255,4 +264,31 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
// Set the log level back // Set the log level back
logger.setLevel(level); logger.setLevel(level);
} }
@Test(description = "Checks '''issue''' reported in issue https://github.com/broadinstitute/gsa-unstable/issues/964")
public void testWrongContigHeaderLengthError() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference),
0, UserException.IncompatibleSequenceDictionaries.class);
executeTest("test wrong header contig length error", spec);
}
@Test
public void testAllowWrongContigHeaderLengthDictIncompat() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ALLOW_SEQ_DICT_INCOMPATIBILITY ",
0, Arrays.asList(EMPTY_MD5));
executeTest("test to allow wrong header contig length, not checking dictionary incompatibility", spec);
}
@Test
public void testAllowWrongContigHeaderLength() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ",
0, Arrays.asList(EMPTY_MD5));
executeTest("test to allow wrong header contig length, no compatibility checks", spec);
}
} }

View File

@ -886,7 +886,7 @@ public class GenomeAnalysisEngine {
// Compile a set of sequence names that exist in the BAM files. // Compile a set of sequence names that exist in the BAM files.
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
if (readsDictionary.size() == 0) { if (readsDictionary.isEmpty()) {
logger.info("Reads file is unmapped. Skipping validation against reference."); logger.info("Reads file is unmapped. Skipping validation against reference.");
return; return;
} }

View File

@ -39,18 +39,18 @@ public class CramIntegrationTest extends WalkerTest {
@DataProvider(name="cramData") @DataProvider(name="cramData")
public Object[][] getCRAMData() { public Object[][] getCRAMData() {
return new Object[][] { return new Object[][] {
{"PrintReads", "exampleBAM.bam", "", "cram", "95cce9111a377d8e84fd076f7f0744df"}, {"PrintReads", "exampleBAM.bam", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"},
{"PrintReads", "exampleCRAM.cram", "", "cram", "95cce9111a377d8e84fd076f7f0744df"}, {"PrintReads", "exampleCRAM.cram", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"},
{"PrintReads", "exampleCRAM.cram", "", "bam", "7c716d5103c20e95a8115b3c2848bf0d"}, {"PrintReads", "exampleCRAM.cram", "", "bam", "247805098718dd74b8a871796424d359"},
{"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "5cc9cac96f7f9819688d47a28ed97778"}, {"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "a5b26631cd89f86f6184bcac7bc9c9ca"},
{"CountLoci", "exampleCRAM.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"}, {"CountLoci", "exampleCRAM.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"},
{"CountLoci", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, {"CountLoci", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
{"CountReads", "exampleCRAM.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"}, {"CountReads", "exampleCRAM.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"},
{"CountReads", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, {"CountReads", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
{"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"}, {"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "24dbd14b60220461f47ec5517962cb7f"},
{"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"}, {"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
{"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"}, {"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
{"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"}, {"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "84bee5063d8fa0d07e7c3ff7e825ae3a"},
{"CountLoci", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"}, {"CountLoci", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
{"CountReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"}, {"CountReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
}; };

View File

@ -38,7 +38,7 @@ public class GATKKeyIntegrationTest extends WalkerTest {
public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s", public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s",
publicTestDir + "exampleFASTA.fasta", publicTestDir + "exampleFASTA.fasta",
publicTestDir + "exampleBAM.bam"); publicTestDir + "exampleBAM.bam");
public static final String MD5_UPON_SUCCESSFUL_RUN = "69b6432aed2d0cebf02e5d414f654425"; public static final String MD5_UPON_SUCCESSFUL_RUN = "462656ec9632f8c21ee534d35093c3f8";
private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) { private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) {

View File

@ -35,7 +35,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest {
@Test @Test
public void testMissingReadGroup() { public void testMissingReadGroup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", "-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null",
0, 0,
UserException.ReadMissingReadGroup.class); UserException.ReadMissingReadGroup.class);
executeTest("test Missing Read Group", spec); executeTest("test Missing Read Group", spec);
@ -44,7 +44,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest {
@Test @Test
public void testUndefinedReadGroup() { public void testUndefinedReadGroup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", "-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null",
0, 0,
UserException.ReadHasUndefinedReadGroup.class); UserException.ReadHasUndefinedReadGroup.class);
executeTest("test Undefined Read Group", spec); executeTest("test Undefined Read Group", spec);

View File

@ -25,6 +25,7 @@
package org.broadinstitute.gatk.utils; package org.broadinstitute.gatk.utils;
import java.math.BigInteger;
import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@ -79,7 +80,7 @@ public class SequenceDictionaryUtils {
IDENTICAL, // the dictionaries are identical IDENTICAL, // the dictionaries are identical
COMMON_SUBSET, // there exists a common subset of equivalent contigs COMMON_SUBSET, // there exists a common subset of equivalent contigs
NO_COMMON_CONTIGS, // no overlap between dictionaries NO_COMMON_CONTIGS, // no overlap between dictionaries
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths and/or MD5s
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine) NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine)
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
// orders with respect to each other // orders with respect to each other
@ -92,7 +93,7 @@ public class SequenceDictionaryUtils {
* @param validationExclusion exclusions to validation * @param validationExclusion exclusions to validation
* @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency * @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency
*/ */
private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) { private static boolean allowNonFatalIncompabilities(final ValidationExclusion.TYPE validationExclusion) {
return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY || return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY ||
validationExclusion == ValidationExclusion.TYPE.ALL ); validationExclusion == ValidationExclusion.TYPE.ALL );
} }
@ -133,15 +134,23 @@ public class SequenceDictionaryUtils {
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
case UNEQUAL_COMMON_CONTIGS: { case UNEQUAL_COMMON_CONTIGS: {
List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); final List<SAMSequenceRecord> x = findNotEqualCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
SAMSequenceRecord elt1 = x.get(0); final SAMSequenceRecord elt1 = x.get(0);
SAMSequenceRecord elt2 = x.get(1); final SAMSequenceRecord elt2 = x.get(1);
String msg = "Found contigs with the same name but different lengths";
String contig1 = " contig " + name1 + " is named " + elt1.getSequenceName() + " with length " + Integer.toString(elt1.getSequenceLength());
if ( elt1.getMd5() != null )
contig1 += " and MD5 " + elt1.getMd5();
String contig2 = " contig " + name2 + " is named " + elt2.getSequenceName() + " with length " + Integer.toString(elt2.getSequenceLength());
if ( elt2.getMd5() != null )
contig2 += " and MD5 " + elt2.getMd5();
if ( elt1.getMd5() != null || elt2.getMd5() != null )
msg += " or MD5s:";
msg += "\n" + contig1 + "\n" + contig2;
// todo -- replace with toString when SAMSequenceRecord has a nice toString routine // todo -- replace with toString when SAMSequenceRecord has a nice toString routine
UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2);
name1, elt1.getSequenceName(), elt1.getSequenceLength(),
name2, elt2.getSequenceName(), elt2.getSequenceLength()),
name1, dict1, name2, dict2);
if ( allowNonFatalIncompabilities(validationExclusion) ) if ( allowNonFatalIncompabilities(validationExclusion) )
logger.warn(ex.getMessage()); logger.warn(ex.getMessage());
@ -226,7 +235,7 @@ public class SequenceDictionaryUtils {
final Set<String> commonContigs = getCommonContigsByName(dict1, dict2); final Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
if (commonContigs.size() == 0) if (commonContigs.isEmpty())
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) )
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
@ -250,12 +259,12 @@ public class SequenceDictionaryUtils {
* @param dict2 * @param dict2
* @return true if all of the common contigs are equivalent * @return true if all of the common contigs are equivalent
*/ */
private static boolean commonContigsHaveSameLengths(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { private static boolean commonContigsHaveSameLengths(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; return findNotEqualCommonContigs(commonContigs, dict1, dict2) == null;
} }
/** /**
* Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns * Returns a List(x,y) that contains two sequence records that are not equal among the common contigs in both dicts. Returns
* null if all common contigs are equivalent * null if all common contigs are equivalent
* *
* @param commonContigs * @param commonContigs
@ -263,7 +272,7 @@ public class SequenceDictionaryUtils {
* @param dict2 * @param dict2
* @return * @return
*/ */
private static List<SAMSequenceRecord> findDisequalCommonContigs(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { private static List<SAMSequenceRecord> findNotEqualCommonContigs(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
for ( String name : commonContigs ) { for ( String name : commonContigs ) {
SAMSequenceRecord elt1 = dict1.getSequence(name); SAMSequenceRecord elt1 = dict1.getSequence(name);
SAMSequenceRecord elt2 = dict2.getSequence(name); SAMSequenceRecord elt2 = dict2.getSequence(name);
@ -275,32 +284,32 @@ public class SequenceDictionaryUtils {
} }
/** /**
* Helper routine that returns two sequence records are equivalent, defined as having the same name and * Helper routine that determines if two sequence records are equivalent, defined as having the same name,
* lengths, if both are non-zero * lengths (if both are non-zero) and MD5 (if present)
* *
* @param me * @param record1 a SAMSequenceRecord
* @param that * @param record2 a SAMSequenceRecord
* @return * @return true if the records are equivalent, false otherwise
*/ */
private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) { private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord record1, final SAMSequenceRecord record2) {
if (me == that) return true; if ( record1 == record2 ) return true;
if (that == null) return false; if ( record1 == null || record2 == null ) return false;
if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength()) // compare length
if ( record1.getSequenceLength() != 0 && record2.getSequenceLength() != 0 && record1.getSequenceLength() != record2.getSequenceLength() )
return false; return false;
// todo -- reenable if we want to be really strict here // compare name
// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) { if ( !record1.getSequenceName().equals(record2.getSequenceName() ))
// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); return false;
// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
// if (!thisMd5.equals(thatMd5)) { // compare MD5
// return false; if ( record1.getMd5() != null && record2.getMd5() != null ){
// } final BigInteger firstMd5 = new BigInteger(record1.getMd5(), 16);
// } final BigInteger secondMd5 = new BigInteger(record2.getMd5(), 16);
// else { if ( !firstMd5.equals(secondMd5) )
if (me.getSequenceName() != that.getSequenceName()) return false;
return false; // Compare using == since we intern() the Strings }
// }
return true; return true;
} }
@ -313,13 +322,13 @@ public class SequenceDictionaryUtils {
* @param dict * @param dict
* @return * @return
*/ */
private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { private static boolean nonCanonicalHumanContigOrder(final SAMSequenceDictionary dict) {
if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false
return false; return false;
SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;
for ( SAMSequenceRecord elt : dict.getSequences() ) { for ( final SAMSequenceRecord elt : dict.getSequences() ) {
if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt; if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt;
if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt; if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt;
if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt; if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt;
@ -355,7 +364,7 @@ public class SequenceDictionaryUtils {
* @param dict2 second SAMSequenceDictionary * @param dict2 second SAMSequenceDictionary
* @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false
*/ */
private static boolean commonContigsAreInSameRelativeOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { private static boolean commonContigsAreInSameRelativeOrder(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
List<SAMSequenceRecord> list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1)); List<SAMSequenceRecord> list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1));
List<SAMSequenceRecord> list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2)); List<SAMSequenceRecord> list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2));
@ -376,8 +385,8 @@ public class SequenceDictionaryUtils {
* @param dict * @param dict
* @return * @return
*/ */
private static List<SAMSequenceRecord> getSequencesOfName(Set<String> commonContigs, SAMSequenceDictionary dict) { private static List<SAMSequenceRecord> getSequencesOfName(final Set<String> commonContigs, final SAMSequenceDictionary dict) {
List<SAMSequenceRecord> l = new ArrayList<SAMSequenceRecord>(commonContigs.size()); final List<SAMSequenceRecord> l = new ArrayList<SAMSequenceRecord>(commonContigs.size());
for ( String name : commonContigs ) { for ( String name : commonContigs ) {
l.add(dict.getSequence(name) ); l.add(dict.getSequence(name) );
} }
@ -401,7 +410,7 @@ public class SequenceDictionaryUtils {
* @param unsorted * @param unsorted
* @return * @return
*/ */
private static List<SAMSequenceRecord> sortSequenceListByIndex(List<SAMSequenceRecord> unsorted) { private static List<SAMSequenceRecord> sortSequenceListByIndex(final List<SAMSequenceRecord> unsorted) {
Collections.sort(unsorted, new CompareSequenceRecordsByIndex()); Collections.sort(unsorted, new CompareSequenceRecordsByIndex());
return unsorted; return unsorted;
} }
@ -418,8 +427,8 @@ public class SequenceDictionaryUtils {
*/ */
private static boolean commonContigsAreAtSameIndices( final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) { private static boolean commonContigsAreAtSameIndices( final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) {
for ( String commonContig : commonContigs ) { for ( String commonContig : commonContigs ) {
SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); final SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); final SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);
// Each common contig must have the same index in both dictionaries // Each common contig must have the same index in both dictionaries
if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) {
@ -489,13 +498,13 @@ public class SequenceDictionaryUtils {
* @return * @return
*/ */
public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
Set<String> intersectingSequenceNames = getContigNames(dict1); final Set<String> intersectingSequenceNames = getContigNames(dict1);
intersectingSequenceNames.retainAll(getContigNames(dict2)); intersectingSequenceNames.retainAll(getContigNames(dict2));
return intersectingSequenceNames; return intersectingSequenceNames;
} }
public static Set<String> getContigNames(SAMSequenceDictionary dict) { public static Set<String> getContigNames(SAMSequenceDictionary dict) {
Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size())); final Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size()));
for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
contigNames.add(dictionaryEntry.getSequenceName()); contigNames.add(dictionaryEntry.getSequenceName());
return contigNames; return contigNames;
@ -515,7 +524,7 @@ public class SequenceDictionaryUtils {
throw new IllegalArgumentException("Sequence dictionary must be non-null"); throw new IllegalArgumentException("Sequence dictionary must be non-null");
} }
StringBuilder s = new StringBuilder("[ "); final StringBuilder s = new StringBuilder("[ ");
for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
s.append(dictionaryEntry.getSequenceName()); s.append(dictionaryEntry.getSequenceName());

View File

@ -40,7 +40,7 @@ public class ValidationExclusion {
ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompatibilities
LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc. LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc.
@EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL @EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL
ALL // do not check for all of the above conditions, DEFAULT ALL // do not check for all of the above conditions, DEFAULT

View File

@ -102,7 +102,7 @@ public class IndexDictionaryUtils {
final SAMSequenceDictionary referenceDict, final SAMSequenceDictionary referenceDict,
final ValidationExclusion.TYPE validationExclusionType ) { final ValidationExclusion.TYPE validationExclusionType ) {
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
if (trackDict == null || trackDict.size() == 0) if (trackDict == null || trackDict.isEmpty())
logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
else { else {
Set<String> trackSequences = new TreeSet<String>(); Set<String> trackSequences = new TreeSet<String>();

View File

@ -93,7 +93,7 @@ public class RMDTrack {
* @param dict the sam sequence dictionary * @param dict the sam sequence dictionary
* @param codec the feature codec we use to decode this type * @param codec the feature codec we use to decode this type
*/ */
public RMDTrack(Class type, String name, File file, AbstractFeatureReader reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) { public RMDTrack(final Class type, final String name, final File file, final AbstractFeatureReader reader, final SAMSequenceDictionary dict, final GenomeLocParser genomeLocParser, final FeatureCodec codec) {
this.type = type; this.type = type;
this.name = name; this.name = name;
this.file = file; this.file = file;

View File

@ -26,6 +26,9 @@
package org.broadinstitute.gatk.utils.refdata.tracks; package org.broadinstitute.gatk.utils.refdata.tracks;
import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.FeatureCodec; import htsjdk.tribble.FeatureCodec;
@ -34,6 +37,7 @@ import htsjdk.tribble.TribbleException;
import htsjdk.tribble.index.Index; import htsjdk.tribble.index.Index;
import htsjdk.tribble.index.IndexFactory; import htsjdk.tribble.index.IndexFactory;
import htsjdk.tribble.util.LittleEndianOutputStream; import htsjdk.tribble.util.LittleEndianOutputStream;
import org.broadinstitute.gatk.utils.SequenceDictionaryUtils;
import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor;
import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.commandline.Tags;
import org.broadinstitute.gatk.utils.ValidationExclusion; import org.broadinstitute.gatk.utils.ValidationExclusion;
@ -49,6 +53,8 @@ import org.broadinstitute.gatk.utils.instrumentation.Sizeof;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map; import java.util.Map;
@ -131,7 +137,7 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
* *
* @return an instance of the track * @return an instance of the track
*/ */
public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { public RMDTrack createInstanceOfTrack(final RMDTriplet fileDescriptor) {
String name = fileDescriptor.getName(); String name = fileDescriptor.getName();
File inputFile = new File(fileDescriptor.getFile()); File inputFile = new File(fileDescriptor.getFile());
@ -146,9 +152,43 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
else else
pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType());
if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file");
validateVariantAgainstSequenceDictionary(name, descriptor.getName(), pair.first, pair.second);
return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile)); return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile));
} }
/**
* Validate the VCF dictionary against the sequence dictionary.
*
* @param name the name of this specific track
* @param descriptorName the name of the feature
* @param reader the feature reader to use as the underlying data source
* @param dict the sam sequence dictionary
*/
private void validateVariantAgainstSequenceDictionary(final String name, final String descriptorName, final AbstractFeatureReader reader, final SAMSequenceDictionary dict ) throws UserException {
// only process if the variant is a VCF
if ( name.equals("variant") && descriptorName.equals("VCF") ){
if ( reader != null && dict != null && reader.getHeader() != null ){
final List<VCFContigHeaderLine> contigs = ((VCFHeader) reader.getHeader()).getContigLines();
if (contigs != null) {
// make the VCF dictionary from the contig header fields
final List<SAMSequenceRecord> vcfContigRecords = new ArrayList<SAMSequenceRecord>();
for (final VCFContigHeaderLine contig : contigs)
vcfContigRecords.add(contig.getSAMSequenceRecord());
// have VCF contig fields so can make a dictionary and compare it to the sequence dictionary
if (!vcfContigRecords.isEmpty()) {
final SAMSequenceDictionary vcfDictionary = new SAMSequenceDictionary(vcfContigRecords);
final SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary(dict.getSequences());
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, name, vcfDictionary, "sequence", sequenceDictionary, false, null);
}
}
}
}
}
/** /**
* Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream.
* @param codecClass Type of Tribble codec class to build. * @param codecClass Type of Tribble codec class to build.
@ -228,7 +268,7 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
if (sequenceDictionary.size() == 0 && dict != null) { if (sequenceDictionary.isEmpty() && dict != null) {
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
if ( ! disableAutoIndexCreation ) { if ( ! disableAutoIndexCreation ) {