Merge pull request #1223 from broadinstitute/rhl_dict_check
Validate VCF with sequence dictionary
This commit is contained in:
commit
dc0e32e3aa
|
|
@ -120,7 +120,7 @@ public class BQSRIntegrationTest extends WalkerTest {
|
||||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
|
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
|
||||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")},
|
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")},
|
||||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0a92fdff5fd26227c29d34eda5a32f49")},
|
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0a92fdff5fd26227c29d34eda5a32f49")},
|
||||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "90d8c24077e8ae9a0037a9aad5f09e31")},
|
{new BQSRTest(hg18Reference, privateTestDir + "originalQuals.1kg.chr1.1-1K.1RG.bam", "chr1:1-1,000", " -OQ", "90d8c24077e8ae9a0037a9aad5f09e31")},
|
||||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "c41ef02c640ef1fed4bfc03b9b33b616")},
|
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "c41ef02c640ef1fed4bfc03b9b33b616")},
|
||||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "b577cd1d529425f66db49620db09fdca")},
|
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "b577cd1d529425f66db49620db09fdca")},
|
||||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
|
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
|
||||||
|
|
|
||||||
|
|
@ -189,7 +189,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testComplexEvents() {
|
public void testComplexEvents() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString("complexEvents.vcf", "ALL"),
|
baseTestString("complexEvents.vcf", "ALL", DEFAULT_REGION, b37KGReference),
|
||||||
0,
|
0,
|
||||||
Arrays.asList(EMPTY_MD5)
|
Arrays.asList(EMPTY_MD5)
|
||||||
);
|
);
|
||||||
|
|
@ -197,6 +197,15 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
||||||
executeTest("test validating complex events", spec);
|
executeTest("test validating complex events", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(description = "Checks out of order header contigs")
|
||||||
|
public void testOutOfOrderHeaderContigsError() {
|
||||||
|
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("complexEvents-outOfOrder.vcf", "ALL", DEFAULT_REGION, b37KGReference),
|
||||||
|
0, UserException.LexicographicallySortedSequenceDictionary.class);
|
||||||
|
executeTest("test out of order header contigs error", spec);
|
||||||
|
}
|
||||||
|
|
||||||
@Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
|
@Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
|
||||||
public void testUnusedAlleleFix() {
|
public void testUnusedAlleleFix() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
|
@ -255,4 +264,31 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
||||||
// Set the log level back
|
// Set the log level back
|
||||||
logger.setLevel(level);
|
logger.setLevel(level);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(description = "Checks '''issue''' reported in issue https://github.com/broadinstitute/gsa-unstable/issues/964")
|
||||||
|
public void testWrongContigHeaderLengthError() {
|
||||||
|
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference),
|
||||||
|
0, UserException.IncompatibleSequenceDictionaries.class);
|
||||||
|
executeTest("test wrong header contig length error", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAllowWrongContigHeaderLengthDictIncompat() {
|
||||||
|
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ALLOW_SEQ_DICT_INCOMPATIBILITY ",
|
||||||
|
0, Arrays.asList(EMPTY_MD5));
|
||||||
|
executeTest("test to allow wrong header contig length, not checking dictionary incompatibility", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAllowWrongContigHeaderLength() {
|
||||||
|
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ",
|
||||||
|
0, Arrays.asList(EMPTY_MD5));
|
||||||
|
executeTest("test to allow wrong header contig length, no compatibility checks", spec);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -886,8 +886,8 @@ public class GenomeAnalysisEngine {
|
||||||
// Compile a set of sequence names that exist in the BAM files.
|
// Compile a set of sequence names that exist in the BAM files.
|
||||||
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
|
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
|
||||||
|
|
||||||
if (readsDictionary.size() == 0) {
|
if (readsDictionary.isEmpty()) {
|
||||||
logger.info("Reads file is unmapped. Skipping validation against reference.");
|
logger.info("Reads file is unmapped. Skipping validation against reference.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -39,18 +39,18 @@ public class CramIntegrationTest extends WalkerTest {
|
||||||
@DataProvider(name="cramData")
|
@DataProvider(name="cramData")
|
||||||
public Object[][] getCRAMData() {
|
public Object[][] getCRAMData() {
|
||||||
return new Object[][] {
|
return new Object[][] {
|
||||||
{"PrintReads", "exampleBAM.bam", "", "cram", "95cce9111a377d8e84fd076f7f0744df"},
|
{"PrintReads", "exampleBAM.bam", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"},
|
||||||
{"PrintReads", "exampleCRAM.cram", "", "cram", "95cce9111a377d8e84fd076f7f0744df"},
|
{"PrintReads", "exampleCRAM.cram", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"},
|
||||||
{"PrintReads", "exampleCRAM.cram", "", "bam", "7c716d5103c20e95a8115b3c2848bf0d"},
|
{"PrintReads", "exampleCRAM.cram", "", "bam", "247805098718dd74b8a871796424d359"},
|
||||||
{"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "5cc9cac96f7f9819688d47a28ed97778"},
|
{"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "a5b26631cd89f86f6184bcac7bc9c9ca"},
|
||||||
{"CountLoci", "exampleCRAM.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"},
|
{"CountLoci", "exampleCRAM.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"},
|
||||||
{"CountLoci", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
|
{"CountLoci", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
|
||||||
{"CountReads", "exampleCRAM.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"},
|
{"CountReads", "exampleCRAM.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"},
|
||||||
{"CountReads", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
|
{"CountReads", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
|
||||||
{"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"},
|
{"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "24dbd14b60220461f47ec5517962cb7f"},
|
||||||
{"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
|
{"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
|
||||||
{"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
|
{"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
|
||||||
{"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"},
|
{"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "84bee5063d8fa0d07e7c3ff7e825ae3a"},
|
||||||
{"CountLoci", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
|
{"CountLoci", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
|
||||||
{"CountReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
|
{"CountReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -38,7 +38,7 @@ public class GATKKeyIntegrationTest extends WalkerTest {
|
||||||
public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s",
|
public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s",
|
||||||
publicTestDir + "exampleFASTA.fasta",
|
publicTestDir + "exampleFASTA.fasta",
|
||||||
publicTestDir + "exampleBAM.bam");
|
publicTestDir + "exampleBAM.bam");
|
||||||
public static final String MD5_UPON_SUCCESSFUL_RUN = "69b6432aed2d0cebf02e5d414f654425";
|
public static final String MD5_UPON_SUCCESSFUL_RUN = "462656ec9632f8c21ee534d35093c3f8";
|
||||||
|
|
||||||
|
|
||||||
private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) {
|
private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) {
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testMissingReadGroup() {
|
public void testMissingReadGroup() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null",
|
"-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null",
|
||||||
0,
|
0,
|
||||||
UserException.ReadMissingReadGroup.class);
|
UserException.ReadMissingReadGroup.class);
|
||||||
executeTest("test Missing Read Group", spec);
|
executeTest("test Missing Read Group", spec);
|
||||||
|
|
@ -44,7 +44,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testUndefinedReadGroup() {
|
public void testUndefinedReadGroup() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null",
|
"-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null",
|
||||||
0,
|
0,
|
||||||
UserException.ReadHasUndefinedReadGroup.class);
|
UserException.ReadHasUndefinedReadGroup.class);
|
||||||
executeTest("test Undefined Read Group", spec);
|
executeTest("test Undefined Read Group", spec);
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.gatk.utils;
|
package org.broadinstitute.gatk.utils;
|
||||||
|
|
||||||
|
import java.math.BigInteger;
|
||||||
import htsjdk.samtools.SAMSequenceDictionary;
|
import htsjdk.samtools.SAMSequenceDictionary;
|
||||||
import htsjdk.samtools.SAMSequenceRecord;
|
import htsjdk.samtools.SAMSequenceRecord;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
@ -79,7 +80,7 @@ public class SequenceDictionaryUtils {
|
||||||
IDENTICAL, // the dictionaries are identical
|
IDENTICAL, // the dictionaries are identical
|
||||||
COMMON_SUBSET, // there exists a common subset of equivalent contigs
|
COMMON_SUBSET, // there exists a common subset of equivalent contigs
|
||||||
NO_COMMON_CONTIGS, // no overlap between dictionaries
|
NO_COMMON_CONTIGS, // no overlap between dictionaries
|
||||||
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths
|
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths and/or MD5s
|
||||||
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine)
|
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine)
|
||||||
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
|
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
|
||||||
// orders with respect to each other
|
// orders with respect to each other
|
||||||
|
|
@ -92,7 +93,7 @@ public class SequenceDictionaryUtils {
|
||||||
* @param validationExclusion exclusions to validation
|
* @param validationExclusion exclusions to validation
|
||||||
* @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency
|
* @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency
|
||||||
*/
|
*/
|
||||||
private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) {
|
private static boolean allowNonFatalIncompabilities(final ValidationExclusion.TYPE validationExclusion) {
|
||||||
return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY ||
|
return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY ||
|
||||||
validationExclusion == ValidationExclusion.TYPE.ALL );
|
validationExclusion == ValidationExclusion.TYPE.ALL );
|
||||||
}
|
}
|
||||||
|
|
@ -133,15 +134,23 @@ public class SequenceDictionaryUtils {
|
||||||
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
|
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
|
||||||
|
|
||||||
case UNEQUAL_COMMON_CONTIGS: {
|
case UNEQUAL_COMMON_CONTIGS: {
|
||||||
List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
|
final List<SAMSequenceRecord> x = findNotEqualCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
|
||||||
SAMSequenceRecord elt1 = x.get(0);
|
final SAMSequenceRecord elt1 = x.get(0);
|
||||||
SAMSequenceRecord elt2 = x.get(1);
|
final SAMSequenceRecord elt2 = x.get(1);
|
||||||
|
|
||||||
|
String msg = "Found contigs with the same name but different lengths";
|
||||||
|
String contig1 = " contig " + name1 + " is named " + elt1.getSequenceName() + " with length " + Integer.toString(elt1.getSequenceLength());
|
||||||
|
if ( elt1.getMd5() != null )
|
||||||
|
contig1 += " and MD5 " + elt1.getMd5();
|
||||||
|
String contig2 = " contig " + name2 + " is named " + elt2.getSequenceName() + " with length " + Integer.toString(elt2.getSequenceLength());
|
||||||
|
if ( elt2.getMd5() != null )
|
||||||
|
contig2 += " and MD5 " + elt2.getMd5();
|
||||||
|
if ( elt1.getMd5() != null || elt2.getMd5() != null )
|
||||||
|
msg += " or MD5s:";
|
||||||
|
msg += "\n" + contig1 + "\n" + contig2;
|
||||||
|
|
||||||
// todo -- replace with toString when SAMSequenceRecord has a nice toString routine
|
// todo -- replace with toString when SAMSequenceRecord has a nice toString routine
|
||||||
UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d",
|
final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2);
|
||||||
name1, elt1.getSequenceName(), elt1.getSequenceLength(),
|
|
||||||
name2, elt2.getSequenceName(), elt2.getSequenceLength()),
|
|
||||||
name1, dict1, name2, dict2);
|
|
||||||
|
|
||||||
if ( allowNonFatalIncompabilities(validationExclusion) )
|
if ( allowNonFatalIncompabilities(validationExclusion) )
|
||||||
logger.warn(ex.getMessage());
|
logger.warn(ex.getMessage());
|
||||||
|
|
@ -226,7 +235,7 @@ public class SequenceDictionaryUtils {
|
||||||
|
|
||||||
final Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
final Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
||||||
|
|
||||||
if (commonContigs.size() == 0)
|
if (commonContigs.isEmpty())
|
||||||
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
|
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
|
||||||
else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) )
|
else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) )
|
||||||
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
|
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
|
||||||
|
|
@ -250,12 +259,12 @@ public class SequenceDictionaryUtils {
|
||||||
* @param dict2
|
* @param dict2
|
||||||
* @return true if all of the common contigs are equivalent
|
* @return true if all of the common contigs are equivalent
|
||||||
*/
|
*/
|
||||||
private static boolean commonContigsHaveSameLengths(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
private static boolean commonContigsHaveSameLengths(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
|
||||||
return findDisequalCommonContigs(commonContigs, dict1, dict2) == null;
|
return findNotEqualCommonContigs(commonContigs, dict1, dict2) == null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns
|
* Returns a List(x,y) that contains two sequence records that are not equal among the common contigs in both dicts. Returns
|
||||||
* null if all common contigs are equivalent
|
* null if all common contigs are equivalent
|
||||||
*
|
*
|
||||||
* @param commonContigs
|
* @param commonContigs
|
||||||
|
|
@ -263,7 +272,7 @@ public class SequenceDictionaryUtils {
|
||||||
* @param dict2
|
* @param dict2
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private static List<SAMSequenceRecord> findDisequalCommonContigs(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
private static List<SAMSequenceRecord> findNotEqualCommonContigs(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
|
||||||
for ( String name : commonContigs ) {
|
for ( String name : commonContigs ) {
|
||||||
SAMSequenceRecord elt1 = dict1.getSequence(name);
|
SAMSequenceRecord elt1 = dict1.getSequence(name);
|
||||||
SAMSequenceRecord elt2 = dict2.getSequence(name);
|
SAMSequenceRecord elt2 = dict2.getSequence(name);
|
||||||
|
|
@ -275,32 +284,32 @@ public class SequenceDictionaryUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper routine that returns two sequence records are equivalent, defined as having the same name and
|
* Helper routine that determines if two sequence records are equivalent, defined as having the same name,
|
||||||
* lengths, if both are non-zero
|
* lengths (if both are non-zero) and MD5 (if present)
|
||||||
*
|
*
|
||||||
* @param me
|
* @param record1 a SAMSequenceRecord
|
||||||
* @param that
|
* @param record2 a SAMSequenceRecord
|
||||||
* @return
|
* @return true if the records are equivalent, false otherwise
|
||||||
*/
|
*/
|
||||||
private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) {
|
private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord record1, final SAMSequenceRecord record2) {
|
||||||
if (me == that) return true;
|
if ( record1 == record2 ) return true;
|
||||||
if (that == null) return false;
|
if ( record1 == null || record2 == null ) return false;
|
||||||
|
|
||||||
if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength())
|
// compare length
|
||||||
|
if ( record1.getSequenceLength() != 0 && record2.getSequenceLength() != 0 && record1.getSequenceLength() != record2.getSequenceLength() )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// todo -- reenable if we want to be really strict here
|
// compare name
|
||||||
// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) {
|
if ( !record1.getSequenceName().equals(record2.getSequenceName() ))
|
||||||
// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
|
return false;
|
||||||
// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
|
|
||||||
// if (!thisMd5.equals(thatMd5)) {
|
// compare MD5
|
||||||
// return false;
|
if ( record1.getMd5() != null && record2.getMd5() != null ){
|
||||||
// }
|
final BigInteger firstMd5 = new BigInteger(record1.getMd5(), 16);
|
||||||
// }
|
final BigInteger secondMd5 = new BigInteger(record2.getMd5(), 16);
|
||||||
// else {
|
if ( !firstMd5.equals(secondMd5) )
|
||||||
if (me.getSequenceName() != that.getSequenceName())
|
return false;
|
||||||
return false; // Compare using == since we intern() the Strings
|
}
|
||||||
// }
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
@ -313,13 +322,13 @@ public class SequenceDictionaryUtils {
|
||||||
* @param dict
|
* @param dict
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) {
|
private static boolean nonCanonicalHumanContigOrder(final SAMSequenceDictionary dict) {
|
||||||
if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false
|
if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;
|
SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;
|
||||||
|
|
||||||
for ( SAMSequenceRecord elt : dict.getSequences() ) {
|
for ( final SAMSequenceRecord elt : dict.getSequences() ) {
|
||||||
if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt;
|
if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt;
|
||||||
if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt;
|
if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt;
|
||||||
if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt;
|
if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt;
|
||||||
|
|
@ -355,7 +364,7 @@ public class SequenceDictionaryUtils {
|
||||||
* @param dict2 second SAMSequenceDictionary
|
* @param dict2 second SAMSequenceDictionary
|
||||||
* @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false
|
* @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false
|
||||||
*/
|
*/
|
||||||
private static boolean commonContigsAreInSameRelativeOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
private static boolean commonContigsAreInSameRelativeOrder(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
|
||||||
List<SAMSequenceRecord> list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1));
|
List<SAMSequenceRecord> list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1));
|
||||||
List<SAMSequenceRecord> list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2));
|
List<SAMSequenceRecord> list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2));
|
||||||
|
|
||||||
|
|
@ -376,8 +385,8 @@ public class SequenceDictionaryUtils {
|
||||||
* @param dict
|
* @param dict
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private static List<SAMSequenceRecord> getSequencesOfName(Set<String> commonContigs, SAMSequenceDictionary dict) {
|
private static List<SAMSequenceRecord> getSequencesOfName(final Set<String> commonContigs, final SAMSequenceDictionary dict) {
|
||||||
List<SAMSequenceRecord> l = new ArrayList<SAMSequenceRecord>(commonContigs.size());
|
final List<SAMSequenceRecord> l = new ArrayList<SAMSequenceRecord>(commonContigs.size());
|
||||||
for ( String name : commonContigs ) {
|
for ( String name : commonContigs ) {
|
||||||
l.add(dict.getSequence(name) );
|
l.add(dict.getSequence(name) );
|
||||||
}
|
}
|
||||||
|
|
@ -401,7 +410,7 @@ public class SequenceDictionaryUtils {
|
||||||
* @param unsorted
|
* @param unsorted
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private static List<SAMSequenceRecord> sortSequenceListByIndex(List<SAMSequenceRecord> unsorted) {
|
private static List<SAMSequenceRecord> sortSequenceListByIndex(final List<SAMSequenceRecord> unsorted) {
|
||||||
Collections.sort(unsorted, new CompareSequenceRecordsByIndex());
|
Collections.sort(unsorted, new CompareSequenceRecordsByIndex());
|
||||||
return unsorted;
|
return unsorted;
|
||||||
}
|
}
|
||||||
|
|
@ -418,8 +427,8 @@ public class SequenceDictionaryUtils {
|
||||||
*/
|
*/
|
||||||
private static boolean commonContigsAreAtSameIndices( final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) {
|
private static boolean commonContigsAreAtSameIndices( final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) {
|
||||||
for ( String commonContig : commonContigs ) {
|
for ( String commonContig : commonContigs ) {
|
||||||
SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
|
final SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
|
||||||
SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);
|
final SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);
|
||||||
|
|
||||||
// Each common contig must have the same index in both dictionaries
|
// Each common contig must have the same index in both dictionaries
|
||||||
if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) {
|
if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) {
|
||||||
|
|
@ -489,13 +498,13 @@ public class SequenceDictionaryUtils {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||||
Set<String> intersectingSequenceNames = getContigNames(dict1);
|
final Set<String> intersectingSequenceNames = getContigNames(dict1);
|
||||||
intersectingSequenceNames.retainAll(getContigNames(dict2));
|
intersectingSequenceNames.retainAll(getContigNames(dict2));
|
||||||
return intersectingSequenceNames;
|
return intersectingSequenceNames;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
|
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
|
||||||
Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size()));
|
final Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size()));
|
||||||
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
|
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
|
||||||
contigNames.add(dictionaryEntry.getSequenceName());
|
contigNames.add(dictionaryEntry.getSequenceName());
|
||||||
return contigNames;
|
return contigNames;
|
||||||
|
|
@ -515,7 +524,7 @@ public class SequenceDictionaryUtils {
|
||||||
throw new IllegalArgumentException("Sequence dictionary must be non-null");
|
throw new IllegalArgumentException("Sequence dictionary must be non-null");
|
||||||
}
|
}
|
||||||
|
|
||||||
StringBuilder s = new StringBuilder("[ ");
|
final StringBuilder s = new StringBuilder("[ ");
|
||||||
|
|
||||||
for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
|
for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
|
||||||
s.append(dictionaryEntry.getSequenceName());
|
s.append(dictionaryEntry.getSequenceName());
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ public class ValidationExclusion {
|
||||||
ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard
|
ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard
|
||||||
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
|
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
|
||||||
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file
|
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file
|
||||||
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities
|
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompatibilities
|
||||||
LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc.
|
LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc.
|
||||||
@EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL
|
@EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL
|
||||||
ALL // do not check for all of the above conditions, DEFAULT
|
ALL // do not check for all of the above conditions, DEFAULT
|
||||||
|
|
|
||||||
|
|
@ -102,7 +102,7 @@ public class IndexDictionaryUtils {
|
||||||
final SAMSequenceDictionary referenceDict,
|
final SAMSequenceDictionary referenceDict,
|
||||||
final ValidationExclusion.TYPE validationExclusionType ) {
|
final ValidationExclusion.TYPE validationExclusionType ) {
|
||||||
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||||
if (trackDict == null || trackDict.size() == 0)
|
if (trackDict == null || trackDict.isEmpty())
|
||||||
logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||||
else {
|
else {
|
||||||
Set<String> trackSequences = new TreeSet<String>();
|
Set<String> trackSequences = new TreeSet<String>();
|
||||||
|
|
|
||||||
|
|
@ -93,7 +93,7 @@ public class RMDTrack {
|
||||||
* @param dict the sam sequence dictionary
|
* @param dict the sam sequence dictionary
|
||||||
* @param codec the feature codec we use to decode this type
|
* @param codec the feature codec we use to decode this type
|
||||||
*/
|
*/
|
||||||
public RMDTrack(Class type, String name, File file, AbstractFeatureReader reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) {
|
public RMDTrack(final Class type, final String name, final File file, final AbstractFeatureReader reader, final SAMSequenceDictionary dict, final GenomeLocParser genomeLocParser, final FeatureCodec codec) {
|
||||||
this.type = type;
|
this.type = type;
|
||||||
this.name = name;
|
this.name = name;
|
||||||
this.file = file;
|
this.file = file;
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,9 @@
|
||||||
package org.broadinstitute.gatk.utils.refdata.tracks;
|
package org.broadinstitute.gatk.utils.refdata.tracks;
|
||||||
|
|
||||||
import htsjdk.samtools.SAMSequenceDictionary;
|
import htsjdk.samtools.SAMSequenceDictionary;
|
||||||
|
import htsjdk.samtools.SAMSequenceRecord;
|
||||||
|
import htsjdk.variant.vcf.VCFContigHeaderLine;
|
||||||
|
import htsjdk.variant.vcf.VCFHeader;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import htsjdk.tribble.AbstractFeatureReader;
|
import htsjdk.tribble.AbstractFeatureReader;
|
||||||
import htsjdk.tribble.FeatureCodec;
|
import htsjdk.tribble.FeatureCodec;
|
||||||
|
|
@ -34,6 +37,7 @@ import htsjdk.tribble.TribbleException;
|
||||||
import htsjdk.tribble.index.Index;
|
import htsjdk.tribble.index.Index;
|
||||||
import htsjdk.tribble.index.IndexFactory;
|
import htsjdk.tribble.index.IndexFactory;
|
||||||
import htsjdk.tribble.util.LittleEndianOutputStream;
|
import htsjdk.tribble.util.LittleEndianOutputStream;
|
||||||
|
import org.broadinstitute.gatk.utils.SequenceDictionaryUtils;
|
||||||
import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor;
|
import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor;
|
||||||
import org.broadinstitute.gatk.utils.commandline.Tags;
|
import org.broadinstitute.gatk.utils.commandline.Tags;
|
||||||
import org.broadinstitute.gatk.utils.ValidationExclusion;
|
import org.broadinstitute.gatk.utils.ValidationExclusion;
|
||||||
|
|
@ -49,6 +53,8 @@ import org.broadinstitute.gatk.utils.instrumentation.Sizeof;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -131,7 +137,7 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
||||||
*
|
*
|
||||||
* @return an instance of the track
|
* @return an instance of the track
|
||||||
*/
|
*/
|
||||||
public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) {
|
public RMDTrack createInstanceOfTrack(final RMDTriplet fileDescriptor) {
|
||||||
String name = fileDescriptor.getName();
|
String name = fileDescriptor.getName();
|
||||||
File inputFile = new File(fileDescriptor.getFile());
|
File inputFile = new File(fileDescriptor.getFile());
|
||||||
|
|
||||||
|
|
@ -146,9 +152,43 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
||||||
else
|
else
|
||||||
pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType());
|
pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType());
|
||||||
if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file");
|
if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file");
|
||||||
|
|
||||||
|
validateVariantAgainstSequenceDictionary(name, descriptor.getName(), pair.first, pair.second);
|
||||||
|
|
||||||
return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile));
|
return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validate the VCF dictionary against the sequence dictionary.
|
||||||
|
*
|
||||||
|
* @param name the name of this specific track
|
||||||
|
* @param descriptorName the name of the feature
|
||||||
|
* @param reader the feature reader to use as the underlying data source
|
||||||
|
* @param dict the sam sequence dictionary
|
||||||
|
*/
|
||||||
|
private void validateVariantAgainstSequenceDictionary(final String name, final String descriptorName, final AbstractFeatureReader reader, final SAMSequenceDictionary dict ) throws UserException {
|
||||||
|
// only process if the variant is a VCF
|
||||||
|
if ( name.equals("variant") && descriptorName.equals("VCF") ){
|
||||||
|
if ( reader != null && dict != null && reader.getHeader() != null ){
|
||||||
|
final List<VCFContigHeaderLine> contigs = ((VCFHeader) reader.getHeader()).getContigLines();
|
||||||
|
if (contigs != null) {
|
||||||
|
// make the VCF dictionary from the contig header fields
|
||||||
|
final List<SAMSequenceRecord> vcfContigRecords = new ArrayList<SAMSequenceRecord>();
|
||||||
|
for (final VCFContigHeaderLine contig : contigs)
|
||||||
|
vcfContigRecords.add(contig.getSAMSequenceRecord());
|
||||||
|
|
||||||
|
// have VCF contig fields so can make a dictionary and compare it to the sequence dictionary
|
||||||
|
if (!vcfContigRecords.isEmpty()) {
|
||||||
|
final SAMSequenceDictionary vcfDictionary = new SAMSequenceDictionary(vcfContigRecords);
|
||||||
|
final SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary(dict.getSequences());
|
||||||
|
|
||||||
|
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, name, vcfDictionary, "sequence", sequenceDictionary, false, null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream.
|
* Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream.
|
||||||
* @param codecClass Type of Tribble codec class to build.
|
* @param codecClass Type of Tribble codec class to build.
|
||||||
|
|
@ -228,7 +268,7 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
||||||
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
|
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
|
||||||
|
|
||||||
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
|
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
|
||||||
if (sequenceDictionary.size() == 0 && dict != null) {
|
if (sequenceDictionary.isEmpty() && dict != null) {
|
||||||
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
|
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
|
||||||
|
|
||||||
if ( ! disableAutoIndexCreation ) {
|
if ( ! disableAutoIndexCreation ) {
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue