Validate VCF with sequence dictionary
This commit is contained in:
parent
29e86a0d01
commit
ccaddefa19
|
|
@ -120,7 +120,7 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0a92fdff5fd26227c29d34eda5a32f49")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "90d8c24077e8ae9a0037a9aad5f09e31")},
|
||||
{new BQSRTest(hg18Reference, privateTestDir + "originalQuals.1kg.chr1.1-1K.1RG.bam", "chr1:1-1,000", " -OQ", "90d8c24077e8ae9a0037a9aad5f09e31")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "c41ef02c640ef1fed4bfc03b9b33b616")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "b577cd1d529425f66db49620db09fdca")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
|
||||
|
|
|
|||
|
|
@ -189,7 +189,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testComplexEvents() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("complexEvents.vcf", "ALL"),
|
||||
baseTestString("complexEvents.vcf", "ALL", DEFAULT_REGION, b37KGReference),
|
||||
0,
|
||||
Arrays.asList(EMPTY_MD5)
|
||||
);
|
||||
|
|
@ -197,6 +197,15 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
|||
executeTest("test validating complex events", spec);
|
||||
}
|
||||
|
||||
@Test(description = "Checks out of order header contigs")
|
||||
public void testOutOfOrderHeaderContigsError() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("complexEvents-outOfOrder.vcf", "ALL", DEFAULT_REGION, b37KGReference),
|
||||
0, UserException.LexicographicallySortedSequenceDictionary.class);
|
||||
executeTest("test out of order header contigs error", spec);
|
||||
}
|
||||
|
||||
@Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
|
||||
public void testUnusedAlleleFix() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
|
|
@ -255,4 +264,31 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
|||
// Set the log level back
|
||||
logger.setLevel(level);
|
||||
}
|
||||
|
||||
@Test(description = "Checks '''issue''' reported in issue https://github.com/broadinstitute/gsa-unstable/issues/964")
|
||||
public void testWrongContigHeaderLengthError() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference),
|
||||
0, UserException.IncompatibleSequenceDictionaries.class);
|
||||
executeTest("test wrong header contig length error", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllowWrongContigHeaderLengthDictIncompat() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ALLOW_SEQ_DICT_INCOMPATIBILITY ",
|
||||
0, Arrays.asList(EMPTY_MD5));
|
||||
executeTest("test to allow wrong header contig length, not checking dictionary incompatibility", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllowWrongContigHeaderLength() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ",
|
||||
0, Arrays.asList(EMPTY_MD5));
|
||||
executeTest("test to allow wrong header contig length, no compatibility checks", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -886,8 +886,8 @@ public class GenomeAnalysisEngine {
|
|||
// Compile a set of sequence names that exist in the BAM files.
|
||||
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
|
||||
|
||||
if (readsDictionary.size() == 0) {
|
||||
logger.info("Reads file is unmapped. Skipping validation against reference.");
|
||||
if (readsDictionary.isEmpty()) {
|
||||
logger.info("Reads file is unmapped. Skipping validation against reference.");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -39,18 +39,18 @@ public class CramIntegrationTest extends WalkerTest {
|
|||
@DataProvider(name="cramData")
|
||||
public Object[][] getCRAMData() {
|
||||
return new Object[][] {
|
||||
{"PrintReads", "exampleBAM.bam", "", "cram", "95cce9111a377d8e84fd076f7f0744df"},
|
||||
{"PrintReads", "exampleCRAM.cram", "", "cram", "95cce9111a377d8e84fd076f7f0744df"},
|
||||
{"PrintReads", "exampleCRAM.cram", "", "bam", "7c716d5103c20e95a8115b3c2848bf0d"},
|
||||
{"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "5cc9cac96f7f9819688d47a28ed97778"},
|
||||
{"PrintReads", "exampleBAM.bam", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"},
|
||||
{"PrintReads", "exampleCRAM.cram", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"},
|
||||
{"PrintReads", "exampleCRAM.cram", "", "bam", "247805098718dd74b8a871796424d359"},
|
||||
{"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "a5b26631cd89f86f6184bcac7bc9c9ca"},
|
||||
{"CountLoci", "exampleCRAM.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"},
|
||||
{"CountLoci", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
|
||||
{"CountReads", "exampleCRAM.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"},
|
||||
{"CountReads", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
|
||||
{"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"},
|
||||
{"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "24dbd14b60220461f47ec5517962cb7f"},
|
||||
{"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
|
||||
{"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
|
||||
{"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"},
|
||||
{"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "84bee5063d8fa0d07e7c3ff7e825ae3a"},
|
||||
{"CountLoci", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
|
||||
{"CountReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
|
||||
};
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ public class GATKKeyIntegrationTest extends WalkerTest {
|
|||
public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s",
|
||||
publicTestDir + "exampleFASTA.fasta",
|
||||
publicTestDir + "exampleBAM.bam");
|
||||
public static final String MD5_UPON_SUCCESSFUL_RUN = "69b6432aed2d0cebf02e5d414f654425";
|
||||
public static final String MD5_UPON_SUCCESSFUL_RUN = "462656ec9632f8c21ee534d35093c3f8";
|
||||
|
||||
|
||||
private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) {
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testMissingReadGroup() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null",
|
||||
"-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null",
|
||||
0,
|
||||
UserException.ReadMissingReadGroup.class);
|
||||
executeTest("test Missing Read Group", spec);
|
||||
|
|
@ -44,7 +44,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testUndefinedReadGroup() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null",
|
||||
"-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null",
|
||||
0,
|
||||
UserException.ReadHasUndefinedReadGroup.class);
|
||||
executeTest("test Undefined Read Group", spec);
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.gatk.utils;
|
||||
|
||||
import java.math.BigInteger;
|
||||
import htsjdk.samtools.SAMSequenceDictionary;
|
||||
import htsjdk.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
|
|
@ -79,7 +80,7 @@ public class SequenceDictionaryUtils {
|
|||
IDENTICAL, // the dictionaries are identical
|
||||
COMMON_SUBSET, // there exists a common subset of equivalent contigs
|
||||
NO_COMMON_CONTIGS, // no overlap between dictionaries
|
||||
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths
|
||||
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths and/or MD5s
|
||||
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine)
|
||||
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
|
||||
// orders with respect to each other
|
||||
|
|
@ -92,7 +93,7 @@ public class SequenceDictionaryUtils {
|
|||
* @param validationExclusion exclusions to validation
|
||||
* @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency
|
||||
*/
|
||||
private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) {
|
||||
private static boolean allowNonFatalIncompabilities(final ValidationExclusion.TYPE validationExclusion) {
|
||||
return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY ||
|
||||
validationExclusion == ValidationExclusion.TYPE.ALL );
|
||||
}
|
||||
|
|
@ -133,15 +134,23 @@ public class SequenceDictionaryUtils {
|
|||
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
|
||||
|
||||
case UNEQUAL_COMMON_CONTIGS: {
|
||||
List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
|
||||
SAMSequenceRecord elt1 = x.get(0);
|
||||
SAMSequenceRecord elt2 = x.get(1);
|
||||
final List<SAMSequenceRecord> x = findNotEqualCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
|
||||
final SAMSequenceRecord elt1 = x.get(0);
|
||||
final SAMSequenceRecord elt2 = x.get(1);
|
||||
|
||||
String msg = "Found contigs with the same name but different lengths";
|
||||
String contig1 = " contig " + name1 + " is named " + elt1.getSequenceName() + " with length " + Integer.toString(elt1.getSequenceLength());
|
||||
if ( elt1.getMd5() != null )
|
||||
contig1 += " and MD5 " + elt1.getMd5();
|
||||
String contig2 = " contig " + name2 + " is named " + elt2.getSequenceName() + " with length " + Integer.toString(elt2.getSequenceLength());
|
||||
if ( elt2.getMd5() != null )
|
||||
contig2 += " and MD5 " + elt2.getMd5();
|
||||
if ( elt1.getMd5() != null || elt2.getMd5() != null )
|
||||
msg += " or MD5s:";
|
||||
msg += "\n" + contig1 + "\n" + contig2;
|
||||
|
||||
// todo -- replace with toString when SAMSequenceRecord has a nice toString routine
|
||||
UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d",
|
||||
name1, elt1.getSequenceName(), elt1.getSequenceLength(),
|
||||
name2, elt2.getSequenceName(), elt2.getSequenceLength()),
|
||||
name1, dict1, name2, dict2);
|
||||
final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2);
|
||||
|
||||
if ( allowNonFatalIncompabilities(validationExclusion) )
|
||||
logger.warn(ex.getMessage());
|
||||
|
|
@ -226,7 +235,7 @@ public class SequenceDictionaryUtils {
|
|||
|
||||
final Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
||||
|
||||
if (commonContigs.size() == 0)
|
||||
if (commonContigs.isEmpty())
|
||||
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
|
||||
else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) )
|
||||
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
|
||||
|
|
@ -250,12 +259,12 @@ public class SequenceDictionaryUtils {
|
|||
* @param dict2
|
||||
* @return true if all of the common contigs are equivalent
|
||||
*/
|
||||
private static boolean commonContigsHaveSameLengths(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
return findDisequalCommonContigs(commonContigs, dict1, dict2) == null;
|
||||
private static boolean commonContigsHaveSameLengths(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
|
||||
return findNotEqualCommonContigs(commonContigs, dict1, dict2) == null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns
|
||||
* Returns a List(x,y) that contains two sequence records that are not equal among the common contigs in both dicts. Returns
|
||||
* null if all common contigs are equivalent
|
||||
*
|
||||
* @param commonContigs
|
||||
|
|
@ -263,7 +272,7 @@ public class SequenceDictionaryUtils {
|
|||
* @param dict2
|
||||
* @return
|
||||
*/
|
||||
private static List<SAMSequenceRecord> findDisequalCommonContigs(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
private static List<SAMSequenceRecord> findNotEqualCommonContigs(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
|
||||
for ( String name : commonContigs ) {
|
||||
SAMSequenceRecord elt1 = dict1.getSequence(name);
|
||||
SAMSequenceRecord elt2 = dict2.getSequence(name);
|
||||
|
|
@ -275,32 +284,32 @@ public class SequenceDictionaryUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Helper routine that returns two sequence records are equivalent, defined as having the same name and
|
||||
* lengths, if both are non-zero
|
||||
* Helper routine that determines if two sequence records are equivalent, defined as having the same name,
|
||||
* lengths (if both are non-zero) and MD5 (if present)
|
||||
*
|
||||
* @param me
|
||||
* @param that
|
||||
* @return
|
||||
* @param record1 a SAMSequenceRecord
|
||||
* @param record2 a SAMSequenceRecord
|
||||
* @return true if the records are equivalent, false otherwise
|
||||
*/
|
||||
private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) {
|
||||
if (me == that) return true;
|
||||
if (that == null) return false;
|
||||
private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord record1, final SAMSequenceRecord record2) {
|
||||
if ( record1 == record2 ) return true;
|
||||
if ( record1 == null || record2 == null ) return false;
|
||||
|
||||
if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength())
|
||||
// compare length
|
||||
if ( record1.getSequenceLength() != 0 && record2.getSequenceLength() != 0 && record1.getSequenceLength() != record2.getSequenceLength() )
|
||||
return false;
|
||||
|
||||
// todo -- reenable if we want to be really strict here
|
||||
// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) {
|
||||
// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
|
||||
// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
|
||||
// if (!thisMd5.equals(thatMd5)) {
|
||||
// return false;
|
||||
// }
|
||||
// }
|
||||
// else {
|
||||
if (me.getSequenceName() != that.getSequenceName())
|
||||
return false; // Compare using == since we intern() the Strings
|
||||
// }
|
||||
// compare name
|
||||
if ( !record1.getSequenceName().equals(record2.getSequenceName() ))
|
||||
return false;
|
||||
|
||||
// compare MD5
|
||||
if ( record1.getMd5() != null && record2.getMd5() != null ){
|
||||
final BigInteger firstMd5 = new BigInteger(record1.getMd5(), 16);
|
||||
final BigInteger secondMd5 = new BigInteger(record2.getMd5(), 16);
|
||||
if ( !firstMd5.equals(secondMd5) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
@ -313,13 +322,13 @@ public class SequenceDictionaryUtils {
|
|||
* @param dict
|
||||
* @return
|
||||
*/
|
||||
private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) {
|
||||
private static boolean nonCanonicalHumanContigOrder(final SAMSequenceDictionary dict) {
|
||||
if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false
|
||||
return false;
|
||||
|
||||
SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;
|
||||
|
||||
for ( SAMSequenceRecord elt : dict.getSequences() ) {
|
||||
for ( final SAMSequenceRecord elt : dict.getSequences() ) {
|
||||
if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt;
|
||||
if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt;
|
||||
if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt;
|
||||
|
|
@ -355,7 +364,7 @@ public class SequenceDictionaryUtils {
|
|||
* @param dict2 second SAMSequenceDictionary
|
||||
* @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false
|
||||
*/
|
||||
private static boolean commonContigsAreInSameRelativeOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
private static boolean commonContigsAreInSameRelativeOrder(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
|
||||
List<SAMSequenceRecord> list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1));
|
||||
List<SAMSequenceRecord> list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2));
|
||||
|
||||
|
|
@ -376,8 +385,8 @@ public class SequenceDictionaryUtils {
|
|||
* @param dict
|
||||
* @return
|
||||
*/
|
||||
private static List<SAMSequenceRecord> getSequencesOfName(Set<String> commonContigs, SAMSequenceDictionary dict) {
|
||||
List<SAMSequenceRecord> l = new ArrayList<SAMSequenceRecord>(commonContigs.size());
|
||||
private static List<SAMSequenceRecord> getSequencesOfName(final Set<String> commonContigs, final SAMSequenceDictionary dict) {
|
||||
final List<SAMSequenceRecord> l = new ArrayList<SAMSequenceRecord>(commonContigs.size());
|
||||
for ( String name : commonContigs ) {
|
||||
l.add(dict.getSequence(name) );
|
||||
}
|
||||
|
|
@ -401,7 +410,7 @@ public class SequenceDictionaryUtils {
|
|||
* @param unsorted
|
||||
* @return
|
||||
*/
|
||||
private static List<SAMSequenceRecord> sortSequenceListByIndex(List<SAMSequenceRecord> unsorted) {
|
||||
private static List<SAMSequenceRecord> sortSequenceListByIndex(final List<SAMSequenceRecord> unsorted) {
|
||||
Collections.sort(unsorted, new CompareSequenceRecordsByIndex());
|
||||
return unsorted;
|
||||
}
|
||||
|
|
@ -418,8 +427,8 @@ public class SequenceDictionaryUtils {
|
|||
*/
|
||||
private static boolean commonContigsAreAtSameIndices( final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) {
|
||||
for ( String commonContig : commonContigs ) {
|
||||
SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
|
||||
SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);
|
||||
final SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
|
||||
final SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);
|
||||
|
||||
// Each common contig must have the same index in both dictionaries
|
||||
if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) {
|
||||
|
|
@ -489,13 +498,13 @@ public class SequenceDictionaryUtils {
|
|||
* @return
|
||||
*/
|
||||
public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
Set<String> intersectingSequenceNames = getContigNames(dict1);
|
||||
final Set<String> intersectingSequenceNames = getContigNames(dict1);
|
||||
intersectingSequenceNames.retainAll(getContigNames(dict2));
|
||||
return intersectingSequenceNames;
|
||||
}
|
||||
|
||||
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
|
||||
Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size()));
|
||||
final Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size()));
|
||||
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
|
||||
contigNames.add(dictionaryEntry.getSequenceName());
|
||||
return contigNames;
|
||||
|
|
@ -515,7 +524,7 @@ public class SequenceDictionaryUtils {
|
|||
throw new IllegalArgumentException("Sequence dictionary must be non-null");
|
||||
}
|
||||
|
||||
StringBuilder s = new StringBuilder("[ ");
|
||||
final StringBuilder s = new StringBuilder("[ ");
|
||||
|
||||
for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
|
||||
s.append(dictionaryEntry.getSequenceName());
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ public class ValidationExclusion {
|
|||
ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard
|
||||
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
|
||||
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file
|
||||
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities
|
||||
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompatibilities
|
||||
LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc.
|
||||
@EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL
|
||||
ALL // do not check for all of the above conditions, DEFAULT
|
||||
|
|
|
|||
|
|
@ -102,7 +102,7 @@ public class IndexDictionaryUtils {
|
|||
final SAMSequenceDictionary referenceDict,
|
||||
final ValidationExclusion.TYPE validationExclusionType ) {
|
||||
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||
if (trackDict == null || trackDict.size() == 0)
|
||||
if (trackDict == null || trackDict.isEmpty())
|
||||
logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||
else {
|
||||
Set<String> trackSequences = new TreeSet<String>();
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ public class RMDTrack {
|
|||
* @param dict the sam sequence dictionary
|
||||
* @param codec the feature codec we use to decode this type
|
||||
*/
|
||||
public RMDTrack(Class type, String name, File file, AbstractFeatureReader reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) {
|
||||
public RMDTrack(final Class type, final String name, final File file, final AbstractFeatureReader reader, final SAMSequenceDictionary dict, final GenomeLocParser genomeLocParser, final FeatureCodec codec) {
|
||||
this.type = type;
|
||||
this.name = name;
|
||||
this.file = file;
|
||||
|
|
|
|||
|
|
@ -26,6 +26,9 @@
|
|||
package org.broadinstitute.gatk.utils.refdata.tracks;
|
||||
|
||||
import htsjdk.samtools.SAMSequenceDictionary;
|
||||
import htsjdk.samtools.SAMSequenceRecord;
|
||||
import htsjdk.variant.vcf.VCFContigHeaderLine;
|
||||
import htsjdk.variant.vcf.VCFHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import htsjdk.tribble.AbstractFeatureReader;
|
||||
import htsjdk.tribble.FeatureCodec;
|
||||
|
|
@ -34,6 +37,7 @@ import htsjdk.tribble.TribbleException;
|
|||
import htsjdk.tribble.index.Index;
|
||||
import htsjdk.tribble.index.IndexFactory;
|
||||
import htsjdk.tribble.util.LittleEndianOutputStream;
|
||||
import org.broadinstitute.gatk.utils.SequenceDictionaryUtils;
|
||||
import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor;
|
||||
import org.broadinstitute.gatk.utils.commandline.Tags;
|
||||
import org.broadinstitute.gatk.utils.ValidationExclusion;
|
||||
|
|
@ -49,6 +53,8 @@ import org.broadinstitute.gatk.utils.instrumentation.Sizeof;
|
|||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
|
|
@ -131,7 +137,7 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
*
|
||||
* @return an instance of the track
|
||||
*/
|
||||
public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) {
|
||||
public RMDTrack createInstanceOfTrack(final RMDTriplet fileDescriptor) {
|
||||
String name = fileDescriptor.getName();
|
||||
File inputFile = new File(fileDescriptor.getFile());
|
||||
|
||||
|
|
@ -146,9 +152,43 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
else
|
||||
pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType());
|
||||
if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file");
|
||||
|
||||
validateVariantAgainstSequenceDictionary(name, descriptor.getName(), pair.first, pair.second);
|
||||
|
||||
return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile));
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate the VCF dictionary against the sequence dictionary.
|
||||
*
|
||||
* @param name the name of this specific track
|
||||
* @param descriptorName the name of the feature
|
||||
* @param reader the feature reader to use as the underlying data source
|
||||
* @param dict the sam sequence dictionary
|
||||
*/
|
||||
private void validateVariantAgainstSequenceDictionary(final String name, final String descriptorName, final AbstractFeatureReader reader, final SAMSequenceDictionary dict ) throws UserException {
|
||||
// only process if the variant is a VCF
|
||||
if ( name.equals("variant") && descriptorName.equals("VCF") ){
|
||||
if ( reader != null && dict != null && reader.getHeader() != null ){
|
||||
final List<VCFContigHeaderLine> contigs = ((VCFHeader) reader.getHeader()).getContigLines();
|
||||
if (contigs != null) {
|
||||
// make the VCF dictionary from the contig header fields
|
||||
final List<SAMSequenceRecord> vcfContigRecords = new ArrayList<SAMSequenceRecord>();
|
||||
for (final VCFContigHeaderLine contig : contigs)
|
||||
vcfContigRecords.add(contig.getSAMSequenceRecord());
|
||||
|
||||
// have VCF contig fields so can make a dictionary and compare it to the sequence dictionary
|
||||
if (!vcfContigRecords.isEmpty()) {
|
||||
final SAMSequenceDictionary vcfDictionary = new SAMSequenceDictionary(vcfContigRecords);
|
||||
final SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary(dict.getSequences());
|
||||
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, name, vcfDictionary, "sequence", sequenceDictionary, false, null);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream.
|
||||
* @param codecClass Type of Tribble codec class to build.
|
||||
|
|
@ -228,7 +268,7 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
|
||||
|
||||
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
|
||||
if (sequenceDictionary.size() == 0 && dict != null) {
|
||||
if (sequenceDictionary.isEmpty() && dict != null) {
|
||||
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
|
||||
|
||||
if ( ! disableAutoIndexCreation ) {
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue