Merge pull request #55 from broadinstitute/dr_fix_sequence_dictionary_validation_GSA-768
Sequence dictionary validation: detect problematic contig indexing differences
This commit is contained in:
commit
ed5aff3702
|
|
@ -48,6 +48,7 @@ import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
||||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
|
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
|
||||||
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
|
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils;
|
||||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
||||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||||
|
|
@ -254,13 +255,16 @@ public class GenomeAnalysisEngine {
|
||||||
// Prepare the data for traversal.
|
// Prepare the data for traversal.
|
||||||
initializeDataSources();
|
initializeDataSources();
|
||||||
|
|
||||||
// initialize sampleDB
|
|
||||||
initializeSampleDB();
|
|
||||||
|
|
||||||
// initialize and validate the interval list
|
// initialize and validate the interval list
|
||||||
initializeIntervals();
|
initializeIntervals();
|
||||||
validateSuppliedIntervals();
|
validateSuppliedIntervals();
|
||||||
|
|
||||||
|
// check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary
|
||||||
|
validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources);
|
||||||
|
|
||||||
|
// initialize sampleDB
|
||||||
|
initializeSampleDB();
|
||||||
|
|
||||||
// our microscheduler, which is in charge of running everything
|
// our microscheduler, which is in charge of running everything
|
||||||
MicroScheduler microScheduler = createMicroscheduler();
|
MicroScheduler microScheduler = createMicroscheduler();
|
||||||
threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor();
|
threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor();
|
||||||
|
|
@ -753,9 +757,8 @@ public class GenomeAnalysisEngine {
|
||||||
* @param reads Reads data source.
|
* @param reads Reads data source.
|
||||||
* @param reference Reference data source.
|
* @param reference Reference data source.
|
||||||
* @param rods a collection of the reference ordered data tracks
|
* @param rods a collection of the reference ordered data tracks
|
||||||
* @param manager manager
|
|
||||||
*/
|
*/
|
||||||
private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, RMDTrackBuilder manager) {
|
private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection<ReferenceOrderedDataSource> rods) {
|
||||||
if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null )
|
if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null )
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
|
@ -772,11 +775,12 @@ public class GenomeAnalysisEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
// compare the reads to the reference
|
// compare the reads to the reference
|
||||||
SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, "reference", referenceDictionary);
|
SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary,
|
||||||
|
"reference", referenceDictionary, true, intervals);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (ReferenceOrderedDataSource rod : rods)
|
for (ReferenceOrderedDataSource rod : rods)
|
||||||
manager.validateTrackSequenceDictionary(rod.getName(),rod.getSequenceDictionary(),referenceDictionary);
|
IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -858,9 +862,6 @@ public class GenomeAnalysisEngine {
|
||||||
genomeLocParser,
|
genomeLocParser,
|
||||||
flashbackData()));
|
flashbackData()));
|
||||||
|
|
||||||
// validation: check to make sure everything the walker needs is present, and that all sequence dictionaries match.
|
|
||||||
validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), dataSources, builder);
|
|
||||||
|
|
||||||
return dataSources;
|
return dataSources;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -101,7 +101,7 @@ public class IndexDictionaryUtils {
|
||||||
Set<String> trackSequences = new TreeSet<String>();
|
Set<String> trackSequences = new TreeSet<String>();
|
||||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
|
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict, false, null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -53,40 +53,55 @@ public class SequenceDictionaryUtils {
|
||||||
//
|
//
|
||||||
// for detecting lexicographically sorted human references
|
// for detecting lexicographically sorted human references
|
||||||
//
|
//
|
||||||
|
private static final boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true;
|
||||||
private static boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true;
|
|
||||||
|
|
||||||
// hg18
|
// hg18
|
||||||
private static SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719);
|
protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719);
|
||||||
private static SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149);
|
protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149);
|
||||||
private static SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737);
|
protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737);
|
||||||
|
|
||||||
// hg19
|
// hg19
|
||||||
private static SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621);
|
protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621);
|
||||||
private static SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373);
|
protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373);
|
||||||
private static SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747);
|
protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747);
|
||||||
|
|
||||||
|
// b36
|
||||||
|
protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719);
|
||||||
|
protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149);
|
||||||
|
protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737);
|
||||||
|
|
||||||
|
// b37
|
||||||
|
protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621);
|
||||||
|
protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373);
|
||||||
|
protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747);
|
||||||
|
|
||||||
|
|
||||||
public enum SequenceDictionaryCompatibility {
|
public enum SequenceDictionaryCompatibility {
|
||||||
IDENTICAL, // the dictionaries are identical
|
IDENTICAL, // the dictionaries are identical
|
||||||
COMMON_SUBSET, // there exists a common subset of equivalent contigs
|
COMMON_SUBSET, // there exists a common subset of equivalent contigs
|
||||||
NO_COMMON_CONTIGS, // no overlap between dictionaries
|
NO_COMMON_CONTIGS, // no overlap between dictionaries
|
||||||
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but aren't equivalent
|
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths
|
||||||
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine)
|
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine)
|
||||||
OUT_OF_ORDER // the two dictionaries overlap but the contigs occur out of order w.r.t each other
|
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
|
||||||
|
// orders with respect to each other
|
||||||
|
DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same
|
||||||
|
// order with respect to each other, but one or more of them have different
|
||||||
|
// indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 }
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param validationExclusion exclusions to validation
|
* @param validationExclusion exclusions to validation
|
||||||
* @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency
|
* @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency
|
||||||
*/
|
*/
|
||||||
public static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) {
|
private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) {
|
||||||
return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY ||
|
return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY ||
|
||||||
validationExclusion == ValidationExclusion.TYPE.ALL );
|
validationExclusion == ValidationExclusion.TYPE.ALL );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Testings for compatbility between dict1 and dict2. If the dictionaries are incompatible, then UserExceptions are
|
* Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then
|
||||||
* thrown with detailed error messages. If the engine is in permissive mode, then logger.warnings of generated instead
|
* UserExceptions are thrown with detailed error messages. If the engine is in permissive mode, then
|
||||||
|
* logger warnings are generated instead.
|
||||||
*
|
*
|
||||||
* @param logger for warnings
|
* @param logger for warnings
|
||||||
* @param validationExclusion exclusions to validation
|
* @param validationExclusion exclusions to validation
|
||||||
|
|
@ -94,9 +109,22 @@ public class SequenceDictionaryUtils {
|
||||||
* @param dict1 the sequence dictionary dict1
|
* @param dict1 the sequence dictionary dict1
|
||||||
* @param name2 name associated with dict2
|
* @param name2 name associated with dict2
|
||||||
* @param dict2 the sequence dictionary dict2
|
* @param dict2 the sequence dictionary dict2
|
||||||
|
* @param isReadsToReferenceComparison true if one of the dictionaries comes from a reads data source (eg., a BAM),
|
||||||
|
* and the other from a reference data source
|
||||||
|
* @param intervals the user-specified genomic intervals: only required when isReadsToReferenceComparison is true,
|
||||||
|
* otherwise can be null
|
||||||
*/
|
*/
|
||||||
public static void validateDictionaries(Logger logger, ValidationExclusion.TYPE validationExclusion, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) {
|
public static void validateDictionaries( final Logger logger,
|
||||||
SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2);
|
final ValidationExclusion.TYPE validationExclusion,
|
||||||
|
final String name1,
|
||||||
|
final SAMSequenceDictionary dict1,
|
||||||
|
final String name2,
|
||||||
|
final SAMSequenceDictionary dict2,
|
||||||
|
final boolean isReadsToReferenceComparison,
|
||||||
|
final GenomeLocSortedSet intervals ) {
|
||||||
|
|
||||||
|
final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2);
|
||||||
|
|
||||||
switch ( type ) {
|
switch ( type ) {
|
||||||
case IDENTICAL:
|
case IDENTICAL:
|
||||||
return;
|
return;
|
||||||
|
|
@ -134,15 +162,48 @@ public class SequenceDictionaryUtils {
|
||||||
logger.warn(ex.getMessage());
|
logger.warn(ex.getMessage());
|
||||||
else
|
else
|
||||||
throw ex;
|
throw ex;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case OUT_OF_ORDER: {
|
case OUT_OF_ORDER: {
|
||||||
UserException ex = new UserException.IncompatibleSequenceDictionaries("Order of contigs differences, which is unsafe", name1, dict1, name2, dict2);
|
UserException ex = new UserException.IncompatibleSequenceDictionaries("Relative ordering of overlapping contigs differs, which is unsafe", name1, dict1, name2, dict2);
|
||||||
if ( allowNonFatalIncompabilities(validationExclusion) )
|
if ( allowNonFatalIncompabilities(validationExclusion) )
|
||||||
logger.warn(ex.getMessage());
|
logger.warn(ex.getMessage());
|
||||||
else
|
else
|
||||||
throw ex;
|
throw ex;
|
||||||
} break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case DIFFERENT_INDICES: {
|
||||||
|
// This is currently only known to be problematic when the index mismatch is between a bam and the
|
||||||
|
// reference AND when the user's intervals actually include one or more of the contigs that are
|
||||||
|
// indexed differently from the reference. In this case, the engine will fail to correctly serve
|
||||||
|
// up the reads from those contigs, so throw an exception unless unsafe operations are enabled.
|
||||||
|
if ( isReadsToReferenceComparison && intervals != null ) {
|
||||||
|
|
||||||
|
final Set<String> misindexedContigs = findMisindexedContigsInIntervals(intervals, dict1, dict2);
|
||||||
|
|
||||||
|
if ( ! misindexedContigs.isEmpty() ) {
|
||||||
|
final String msg = String.format("The following contigs included in the intervals to process have " +
|
||||||
|
"different indices in the sequence dictionaries for the reads vs. " +
|
||||||
|
"the reference: %s. As a result, the GATK engine will not correctly " +
|
||||||
|
"process reads from these contigs. You should either fix the sequence " +
|
||||||
|
"dictionaries for your reads so that these contigs have the same indices " +
|
||||||
|
"as in the sequence dictionary for your reference, or exclude these contigs " +
|
||||||
|
"from your intervals. This error can be disabled via -U %s, " +
|
||||||
|
"however this is not recommended as the GATK engine will not behave correctly.",
|
||||||
|
misindexedContigs, ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY);
|
||||||
|
final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2);
|
||||||
|
|
||||||
|
if ( allowNonFatalIncompabilities(validationExclusion) )
|
||||||
|
logger.warn(ex.getMessage());
|
||||||
|
else
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
throw new ReviewedStingException("Unexpected SequenceDictionaryComparison type: " + type);
|
throw new ReviewedStingException("Unexpected SequenceDictionaryComparison type: " + type);
|
||||||
}
|
}
|
||||||
|
|
@ -151,32 +212,33 @@ public class SequenceDictionaryUtils {
|
||||||
/**
|
/**
|
||||||
* Workhorse routine that takes two dictionaries and returns their compatibility.
|
* Workhorse routine that takes two dictionaries and returns their compatibility.
|
||||||
*
|
*
|
||||||
* @param dict1
|
* @param dict1 first sequence dictionary
|
||||||
* @param dict2
|
* @param dict2 second sequence dictionary
|
||||||
* @return
|
* @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries
|
||||||
*/
|
*/
|
||||||
public static SequenceDictionaryCompatibility compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
|
||||||
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
|
if ( nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2) )
|
||||||
if ( nonCanonicalHumanContigOrder( dict1 ) || nonCanonicalHumanContigOrder(dict2) )
|
|
||||||
return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER;
|
return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER;
|
||||||
|
|
||||||
Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
final Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
||||||
|
|
||||||
if (commonContigs.size() == 0)
|
if (commonContigs.size() == 0)
|
||||||
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
|
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
|
||||||
else if ( ! commonContigsAreEquivalent( commonContigs, dict1, dict2 ) )
|
else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) )
|
||||||
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
|
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
|
||||||
else if ( ! commonContigsAreInOrder( commonContigs, dict1, dict2 ) )
|
else if ( ! commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2) )
|
||||||
return SequenceDictionaryCompatibility.OUT_OF_ORDER;
|
return SequenceDictionaryCompatibility.OUT_OF_ORDER;
|
||||||
else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() )
|
else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() )
|
||||||
return SequenceDictionaryCompatibility.IDENTICAL;
|
return SequenceDictionaryCompatibility.IDENTICAL;
|
||||||
|
else if ( ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) )
|
||||||
|
return SequenceDictionaryCompatibility.DIFFERENT_INDICES;
|
||||||
else {
|
else {
|
||||||
return SequenceDictionaryCompatibility.COMMON_SUBSET;
|
return SequenceDictionaryCompatibility.COMMON_SUBSET;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalece means
|
* Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means
|
||||||
* that the seq records have the same length, if both are non-zero.
|
* that the seq records have the same length, if both are non-zero.
|
||||||
*
|
*
|
||||||
* @param commonContigs
|
* @param commonContigs
|
||||||
|
|
@ -184,7 +246,7 @@ public class SequenceDictionaryUtils {
|
||||||
* @param dict2
|
* @param dict2
|
||||||
* @return true if all of the common contigs are equivalent
|
* @return true if all of the common contigs are equivalent
|
||||||
*/
|
*/
|
||||||
private static boolean commonContigsAreEquivalent(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
private static boolean commonContigsHaveSameLengths(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||||
return findDisequalCommonContigs(commonContigs, dict1, dict2) == null;
|
return findDisequalCommonContigs(commonContigs, dict1, dict2) == null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -201,7 +263,7 @@ public class SequenceDictionaryUtils {
|
||||||
for ( String name : commonContigs ) {
|
for ( String name : commonContigs ) {
|
||||||
SAMSequenceRecord elt1 = dict1.getSequence(name);
|
SAMSequenceRecord elt1 = dict1.getSequence(name);
|
||||||
SAMSequenceRecord elt2 = dict2.getSequence(name);
|
SAMSequenceRecord elt2 = dict2.getSequence(name);
|
||||||
if ( ! SequenceRecordsAreEquivalent(elt1, elt2) )
|
if ( ! sequenceRecordsAreEquivalent(elt1, elt2) )
|
||||||
return Arrays.asList(elt1,elt2);
|
return Arrays.asList(elt1,elt2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -216,12 +278,10 @@ public class SequenceDictionaryUtils {
|
||||||
* @param that
|
* @param that
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private static boolean SequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) {
|
private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) {
|
||||||
if (me == that) return true;
|
if (me == that) return true;
|
||||||
if (that == null) return false;
|
if (that == null) return false;
|
||||||
|
|
||||||
// I don't care if the indices are difference
|
|
||||||
//if (me.getSequenceIndex() != that.getSequenceIndex()) return false;
|
|
||||||
if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength())
|
if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
|
@ -280,18 +340,18 @@ public class SequenceDictionaryUtils {
|
||||||
return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength();
|
return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true if the common contigs in dict1 and dict2 are in the same order. This is accomplished by getting the
|
* Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to
|
||||||
* common contigs in both dictionaries, sorting these according to their indices, and the walking through
|
* absolute index position. This is accomplished by getting the common contigs in both dictionaries, sorting
|
||||||
* the sorted list to ensure that each ordered contig is equivalent
|
* these according to their indices, and then walking through the sorted list to ensure that each ordered contig
|
||||||
|
* is equivalent
|
||||||
*
|
*
|
||||||
* @param commonContigs
|
* @param commonContigs names of the contigs common to both dictionaries
|
||||||
* @param dict1
|
* @param dict1 first SAMSequenceDictionary
|
||||||
* @param dict2
|
* @param dict2 second SAMSequenceDictionary
|
||||||
* @return
|
* @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false
|
||||||
*/
|
*/
|
||||||
public static boolean commonContigsAreInOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
private static boolean commonContigsAreInSameRelativeOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||||
List<SAMSequenceRecord> list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1));
|
List<SAMSequenceRecord> list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1));
|
||||||
List<SAMSequenceRecord> list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2));
|
List<SAMSequenceRecord> list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2));
|
||||||
|
|
||||||
|
|
@ -321,10 +381,6 @@ public class SequenceDictionaryUtils {
|
||||||
return l;
|
return l;
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
// Utilities for comparing the order of sequence records
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compares sequence records by their order
|
* Compares sequence records by their order
|
||||||
*/
|
*/
|
||||||
|
|
@ -346,6 +402,81 @@ public class SequenceDictionaryUtils {
|
||||||
return unsorted;
|
return unsorted;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks whether the common contigs in the given sequence dictionaries occur at the same indices
|
||||||
|
* in both dictionaries
|
||||||
|
*
|
||||||
|
* @param commonContigs Set of names of the contigs that occur in both dictionaries
|
||||||
|
* @param dict1 first sequence dictionary
|
||||||
|
* @param dict2 second sequence dictionary
|
||||||
|
* @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries,
|
||||||
|
* otherwise false
|
||||||
|
*/
|
||||||
|
private static boolean commonContigsAreAtSameIndices( final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) {
|
||||||
|
for ( String commonContig : commonContigs ) {
|
||||||
|
SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
|
||||||
|
SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);
|
||||||
|
|
||||||
|
// Each common contig must have the same index in both dictionaries
|
||||||
|
if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the set of names of the contigs found in both sequence dictionaries that have different indices
|
||||||
|
* in the two dictionaries.
|
||||||
|
*
|
||||||
|
* @param commonContigs Set of names of the contigs common to both dictionaries
|
||||||
|
* @param dict1 first sequence dictionary
|
||||||
|
* @param dict2 second sequence dictionary
|
||||||
|
* @return a Set containing the names of the common contigs indexed differently in dict1 vs. dict2,
|
||||||
|
* or an empty Set if there are no such contigs
|
||||||
|
*/
|
||||||
|
private static Set<String> getDifferentlyIndexedCommonContigs( final Set<String> commonContigs,
|
||||||
|
final SAMSequenceDictionary dict1,
|
||||||
|
final SAMSequenceDictionary dict2 ) {
|
||||||
|
|
||||||
|
final Set<String> differentlyIndexedCommonContigs = new LinkedHashSet<String>(Utils.optimumHashSize(commonContigs.size()));
|
||||||
|
|
||||||
|
for ( String commonContig : commonContigs ) {
|
||||||
|
if ( dict1.getSequence(commonContig).getSequenceIndex() != dict2.getSequence(commonContig).getSequenceIndex() ) {
|
||||||
|
differentlyIndexedCommonContigs.add(commonContig);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return differentlyIndexedCommonContigs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds the names of any contigs indexed differently in the two sequence dictionaries that also
|
||||||
|
* occur in the provided set of intervals.
|
||||||
|
*
|
||||||
|
* @param intervals GenomeLocSortedSet containing the intervals to check
|
||||||
|
* @param dict1 first sequence dictionary
|
||||||
|
* @param dict2 second sequence dictionary
|
||||||
|
* @return a Set of the names of the contigs indexed differently in dict1 vs dict2 that also
|
||||||
|
* occur in the provided intervals, or an empty Set if there are no such contigs
|
||||||
|
*/
|
||||||
|
private static Set<String> findMisindexedContigsInIntervals( final GenomeLocSortedSet intervals,
|
||||||
|
final SAMSequenceDictionary dict1,
|
||||||
|
final SAMSequenceDictionary dict2 ) {
|
||||||
|
|
||||||
|
final Set<String> differentlyIndexedCommonContigs = getDifferentlyIndexedCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
|
||||||
|
final Set<String> misindexedContigsInIntervals = new LinkedHashSet<String>(Utils.optimumHashSize(differentlyIndexedCommonContigs.size()));
|
||||||
|
|
||||||
|
// We know differentlyIndexedCommonContigs is a HashSet, so this loop is O(intervals)
|
||||||
|
for ( GenomeLoc interval : intervals ) {
|
||||||
|
if ( differentlyIndexedCommonContigs.contains(interval.getContig()) ) {
|
||||||
|
misindexedContigsInIntervals.add(interval.getContig());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return misindexedContigsInIntervals;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the set of contig names found in both dicts.
|
* Returns the set of contig names found in both dicts.
|
||||||
|
|
@ -360,9 +491,37 @@ public class SequenceDictionaryUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
|
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
|
||||||
Set<String> contigNames = new HashSet<String>((int)(dict.size() / 0.75f) + 1, 0.75f);
|
Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size()));
|
||||||
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
|
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
|
||||||
contigNames.add(dictionaryEntry.getSequenceName());
|
contigNames.add(dictionaryEntry.getSequenceName());
|
||||||
return contigNames;
|
return contigNames;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a compact String representation of the sequence dictionary it's passed
|
||||||
|
*
|
||||||
|
* The format of the returned String is:
|
||||||
|
* [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... ]
|
||||||
|
*
|
||||||
|
* @param dict a non-null SAMSequenceDictionary
|
||||||
|
* @return A String containing all of the contig names and lengths from the sequence dictionary it's passed
|
||||||
|
*/
|
||||||
|
public static String getDictionaryAsString( final SAMSequenceDictionary dict ) {
|
||||||
|
if ( dict == null ) {
|
||||||
|
throw new IllegalArgumentException("Sequence dictionary must be non-null");
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder s = new StringBuilder("[ ");
|
||||||
|
|
||||||
|
for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
|
||||||
|
s.append(dictionaryEntry.getSequenceName());
|
||||||
|
s.append("(length:");
|
||||||
|
s.append(dictionaryEntry.getSequenceLength());
|
||||||
|
s.append(") ");
|
||||||
|
}
|
||||||
|
|
||||||
|
s.append("]");
|
||||||
|
|
||||||
|
return s.toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -64,22 +64,18 @@ public class DictionaryConsistencyIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test public void failBAM1() { executeTest("b36bam-v-b37", testBAM(b37KGReference, b36BAM)); }
|
@Test public void failBAM1() { executeTest("b36bam-v-b37", testBAM(b37KGReference, b36BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||||
@Test public void failBAM2() { executeTest("b36bam-v-hg18", testBAM(hg18Reference, b36BAM)); }
|
@Test public void failBAM2() { executeTest("b36bam-v-hg18", testBAM(hg18Reference, b36BAM, "chr1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||||
@Test public void failBAM3() { executeTest("b36bam-v-hg19", testBAM(hg19Reference, b36BAM)); }
|
@Test public void failBAM3() { executeTest("b36bam-v-hg19", testBAM(hg19Reference, b36BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||||
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM, "chr1", UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||||
|
|
||||||
@Test public void failBAM5() { executeTest("hg18bam-v-b36", testBAM(b36KGReference, hg18BAM)); }
|
@Test public void failBAM5() { executeTest("hg18bam-v-b36", testBAM(b36KGReference, hg18BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||||
@Test public void failBAM6() { executeTest("hg18bam-v-b37", testBAM(b37KGReference, hg18BAM)); }
|
@Test public void failBAM6() { executeTest("hg18bam-v-b37", testBAM(b37KGReference, hg18BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||||
@Test public void failBAM7() { executeTest("hg18bam-v-hg19", testBAM(hg19Reference, hg18BAM)); }
|
@Test public void failBAM7() { executeTest("hg18bam-v-hg19", testBAM(hg19Reference, hg18BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||||
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM, "chr1", UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||||
|
|
||||||
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam) {
|
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam, String contig, Class c) {
|
||||||
return testBAM(ref, bam, UserException.IncompatibleSequenceDictionaries.class);
|
return new WalkerTest.WalkerTestSpec("-T UnifiedGenotyper -I " + bam + " -R " + ref + " -L " + contig + ":10,000,000-11,000,000 -o %s",
|
||||||
}
|
|
||||||
|
|
||||||
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam, Class c) {
|
|
||||||
return new WalkerTest.WalkerTestSpec("-T UnifiedGenotyper -I " + bam + " -R " + ref + " -L 1:10,000,000-11,000,000 -o %s",
|
|
||||||
1, c);
|
1, c);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,241 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.testng.annotations.DataProvider;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
import org.testng.Assert;
|
||||||
|
|
||||||
|
import static org.broadinstitute.sting.utils.SequenceDictionaryUtils.*;
|
||||||
|
import static org.broadinstitute.sting.utils.SequenceDictionaryUtils.SequenceDictionaryCompatibility.*;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class SequenceDictionaryUtilsUnitTest extends BaseTest {
|
||||||
|
|
||||||
|
private static Logger logger = Logger.getLogger(SequenceDictionaryUtilsUnitTest.class);
|
||||||
|
|
||||||
|
|
||||||
|
@DataProvider( name = "SequenceDictionaryDataProvider" )
|
||||||
|
public Object[][] generateSequenceDictionaryTestData() {
|
||||||
|
final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571);
|
||||||
|
final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309);
|
||||||
|
final SAMSequenceRecord CHR_NONSTANDARD2 = new SAMSequenceRecord("NonStandard2", 8675308);
|
||||||
|
|
||||||
|
final Class NO_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class;
|
||||||
|
final Class UNEQUAL_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class;
|
||||||
|
final Class NON_CANONICAL_HUMAN_ORDER_EXCEPTION = UserException.LexicographicallySortedSequenceDictionary.class;
|
||||||
|
final Class OUT_OF_ORDER_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class;
|
||||||
|
final Class DIFFERENT_INDICES_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class;
|
||||||
|
|
||||||
|
final List<SAMSequenceRecord> hg19Sequences = Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR10_HG19);
|
||||||
|
final GenomeLocParser hg19GenomeLocParser = new GenomeLocParser(new SAMSequenceDictionary(hg19Sequences));
|
||||||
|
final List<GenomeLoc> hg19AllContigsIntervals = Arrays.asList(hg19GenomeLocParser.createGenomeLoc("chrM", 0, 1),
|
||||||
|
hg19GenomeLocParser.createGenomeLoc("chr1", 0, 1),
|
||||||
|
hg19GenomeLocParser.createGenomeLoc("chr2", 0, 1),
|
||||||
|
hg19GenomeLocParser.createGenomeLoc("chr10", 0, 1));
|
||||||
|
final List<GenomeLoc> hg19PartialContigsIntervals = Arrays.asList(hg19GenomeLocParser.createGenomeLoc("chrM", 0, 1),
|
||||||
|
hg19GenomeLocParser.createGenomeLoc("chr1", 0, 1));
|
||||||
|
final GenomeLocSortedSet hg19AllContigsIntervalSet = new GenomeLocSortedSet(hg19GenomeLocParser, hg19AllContigsIntervals);
|
||||||
|
final GenomeLocSortedSet hg19PartialContigsIntervalSet = new GenomeLocSortedSet(hg19GenomeLocParser, hg19PartialContigsIntervals);
|
||||||
|
|
||||||
|
return new Object[][] {
|
||||||
|
// Identical dictionaries:
|
||||||
|
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), null, IDENTICAL, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, IDENTICAL, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), null, IDENTICAL, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, IDENTICAL, null, false, null },
|
||||||
|
|
||||||
|
// Dictionaries with a common subset:
|
||||||
|
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null },
|
||||||
|
|
||||||
|
// Dictionaries with no common contigs:
|
||||||
|
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
|
||||||
|
// Dictionaries with unequal common contigs:
|
||||||
|
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
|
||||||
|
// One or both dictionaries in non-canonical human order:
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||||
|
|
||||||
|
// Dictionaries with a common subset, but different relative ordering within that subset:
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||||
|
|
||||||
|
|
||||||
|
// Dictionaries with a common subset in the same relative order, but with different indices.
|
||||||
|
// This will only throw an exception during validation if isReadsToReferenceComparison is true,
|
||||||
|
// and there are intervals overlapping the misindexed contigs:
|
||||||
|
|
||||||
|
// These have isReadsToReferenceComparison == true and overlapping intervals, so we expect an exception:
|
||||||
|
{ Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||||
|
|
||||||
|
// These have isReadsToReferenceComparison == true but no overlapping intervals, so we don't expect an exception:
|
||||||
|
{ Arrays.asList(CHR2_HG19, CHR10_HG19), Arrays.asList(CHR10_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet },
|
||||||
|
|
||||||
|
// These have isReadsToReferenceComparison == false, so we don't expect an exception:
|
||||||
|
{ Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), null, DIFFERENT_INDICES, null, false, hg19AllContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), null, DIFFERENT_INDICES, null, false, hg19AllContigsIntervalSet },
|
||||||
|
|
||||||
|
|
||||||
|
// Tests for validation exclusions. Note that errors resulting from NO_COMMON_CONTIGs cannot be suppressed
|
||||||
|
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), ValidationExclusion.TYPE.ALL, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, UNEQUAL_COMMON_CONTIGS, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), ValidationExclusion.TYPE.ALL, UNEQUAL_COMMON_CONTIGS, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, NON_CANONICAL_HUMAN_ORDER, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), ValidationExclusion.TYPE.ALL, NON_CANONICAL_HUMAN_ORDER, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, OUT_OF_ORDER, null, false, null },
|
||||||
|
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), ValidationExclusion.TYPE.ALL, OUT_OF_ORDER, null, false, null },
|
||||||
|
{ Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, DIFFERENT_INDICES, null, true, hg19AllContigsIntervalSet },
|
||||||
|
{ Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), ValidationExclusion.TYPE.ALL, DIFFERENT_INDICES, null, true, hg19AllContigsIntervalSet }
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test( dataProvider = "SequenceDictionaryDataProvider" )
|
||||||
|
public void testSequenceDictionaryValidation( final List<SAMSequenceRecord> firstDictionaryContigs,
|
||||||
|
final List<SAMSequenceRecord> secondDictionaryContigs,
|
||||||
|
final ValidationExclusion.TYPE validationExclusions,
|
||||||
|
final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility,
|
||||||
|
final Class expectedExceptionUponValidation,
|
||||||
|
final boolean isReadsToReferenceComparison,
|
||||||
|
final GenomeLocSortedSet intervals ) {
|
||||||
|
|
||||||
|
final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs);
|
||||||
|
final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs);
|
||||||
|
final String testDescription = String.format("First dictionary: %s Second dictionary: %s Validation exclusions: %s",
|
||||||
|
SequenceDictionaryUtils.getDictionaryAsString(firstDictionary),
|
||||||
|
SequenceDictionaryUtils.getDictionaryAsString(secondDictionary),
|
||||||
|
validationExclusions);
|
||||||
|
|
||||||
|
Exception exceptionThrown = null;
|
||||||
|
try {
|
||||||
|
SequenceDictionaryUtils.validateDictionaries(logger,
|
||||||
|
validationExclusions,
|
||||||
|
"firstDictionary",
|
||||||
|
firstDictionary,
|
||||||
|
"secondDictionary",
|
||||||
|
secondDictionary,
|
||||||
|
isReadsToReferenceComparison,
|
||||||
|
intervals);
|
||||||
|
}
|
||||||
|
catch ( Exception e ) {
|
||||||
|
exceptionThrown = e;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( expectedExceptionUponValidation != null ) {
|
||||||
|
Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown),
|
||||||
|
String.format("Expected exception %s but saw %s instead. %s",
|
||||||
|
expectedExceptionUponValidation.getSimpleName(),
|
||||||
|
exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(),
|
||||||
|
testDescription));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Assert.assertTrue(exceptionThrown == null,
|
||||||
|
String.format("Expected no exception but saw exception %s instead. %s",
|
||||||
|
exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none",
|
||||||
|
testDescription));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test( dataProvider = "SequenceDictionaryDataProvider" )
|
||||||
|
public void testSequenceDictionaryComparison( final List<SAMSequenceRecord> firstDictionaryContigs,
|
||||||
|
final List<SAMSequenceRecord> secondDictionaryContigs,
|
||||||
|
final ValidationExclusion.TYPE validationExclusions,
|
||||||
|
final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility,
|
||||||
|
final Class expectedExceptionUponValidation,
|
||||||
|
final boolean isReadsToReferenceComparison,
|
||||||
|
final GenomeLocSortedSet intervals ) {
|
||||||
|
|
||||||
|
final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs);
|
||||||
|
final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs);
|
||||||
|
final String testDescription = String.format("First dictionary: %s Second dictionary: %s",
|
||||||
|
SequenceDictionaryUtils.getDictionaryAsString(firstDictionary),
|
||||||
|
SequenceDictionaryUtils.getDictionaryAsString(secondDictionary));
|
||||||
|
|
||||||
|
final SequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility =
|
||||||
|
SequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary);
|
||||||
|
|
||||||
|
Assert.assertTrue(reportedCompatibility == dictionaryCompatibility,
|
||||||
|
String.format("Dictionary comparison should have returned %s but instead returned %s. %s",
|
||||||
|
dictionaryCompatibility, reportedCompatibility, testDescription));
|
||||||
|
}
|
||||||
|
|
||||||
|
private SAMSequenceDictionary createSequenceDictionary( final List<SAMSequenceRecord> contigs ) {
|
||||||
|
final List<SAMSequenceRecord> clonedContigs = new ArrayList<SAMSequenceRecord>(contigs.size());
|
||||||
|
|
||||||
|
// Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects
|
||||||
|
// across multiple dictionaries in tests
|
||||||
|
for ( SAMSequenceRecord contig : contigs ) {
|
||||||
|
clonedContigs.add(contig.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
return new SAMSequenceDictionary(clonedContigs);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue