Merge pull request #55 from broadinstitute/dr_fix_sequence_dictionary_validation_GSA-768
Sequence dictionary validation: detect problematic contig indexing differences
This commit is contained in:
commit
ed5aff3702
|
|
@ -48,6 +48,7 @@ import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
|||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
|
||||
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
|
|
@ -254,13 +255,16 @@ public class GenomeAnalysisEngine {
|
|||
// Prepare the data for traversal.
|
||||
initializeDataSources();
|
||||
|
||||
// initialize sampleDB
|
||||
initializeSampleDB();
|
||||
|
||||
// initialize and validate the interval list
|
||||
initializeIntervals();
|
||||
validateSuppliedIntervals();
|
||||
|
||||
// check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary
|
||||
validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources);
|
||||
|
||||
// initialize sampleDB
|
||||
initializeSampleDB();
|
||||
|
||||
// our microscheduler, which is in charge of running everything
|
||||
MicroScheduler microScheduler = createMicroscheduler();
|
||||
threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor();
|
||||
|
|
@ -753,9 +757,8 @@ public class GenomeAnalysisEngine {
|
|||
* @param reads Reads data source.
|
||||
* @param reference Reference data source.
|
||||
* @param rods a collection of the reference ordered data tracks
|
||||
* @param manager manager
|
||||
*/
|
||||
private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, RMDTrackBuilder manager) {
|
||||
private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection<ReferenceOrderedDataSource> rods) {
|
||||
if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null )
|
||||
return;
|
||||
|
||||
|
|
@ -772,11 +775,12 @@ public class GenomeAnalysisEngine {
|
|||
}
|
||||
|
||||
// compare the reads to the reference
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, "reference", referenceDictionary);
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary,
|
||||
"reference", referenceDictionary, true, intervals);
|
||||
}
|
||||
|
||||
for (ReferenceOrderedDataSource rod : rods)
|
||||
manager.validateTrackSequenceDictionary(rod.getName(),rod.getSequenceDictionary(),referenceDictionary);
|
||||
IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -858,9 +862,6 @@ public class GenomeAnalysisEngine {
|
|||
genomeLocParser,
|
||||
flashbackData()));
|
||||
|
||||
// validation: check to make sure everything the walker needs is present, and that all sequence dictionaries match.
|
||||
validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), dataSources, builder);
|
||||
|
||||
return dataSources;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ public class IndexDictionaryUtils {
|
|||
Set<String> trackSequences = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict, false, null);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,40 +53,55 @@ public class SequenceDictionaryUtils {
|
|||
//
|
||||
// for detecting lexicographically sorted human references
|
||||
//
|
||||
|
||||
private static boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true;
|
||||
private static final boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true;
|
||||
|
||||
// hg18
|
||||
private static SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719);
|
||||
private static SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149);
|
||||
private static SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737);
|
||||
protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719);
|
||||
protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149);
|
||||
protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737);
|
||||
|
||||
// hg19
|
||||
private static SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621);
|
||||
private static SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373);
|
||||
private static SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747);
|
||||
protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621);
|
||||
protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373);
|
||||
protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747);
|
||||
|
||||
// b36
|
||||
protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719);
|
||||
protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149);
|
||||
protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737);
|
||||
|
||||
// b37
|
||||
protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621);
|
||||
protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373);
|
||||
protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747);
|
||||
|
||||
|
||||
public enum SequenceDictionaryCompatibility {
|
||||
IDENTICAL, // the dictionaries are identical
|
||||
COMMON_SUBSET, // there exists a common subset of equivalent contigs
|
||||
NO_COMMON_CONTIGS, // no overlap between dictionaries
|
||||
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but aren't equivalent
|
||||
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths
|
||||
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine)
|
||||
OUT_OF_ORDER // the two dictionaries overlap but the contigs occur out of order w.r.t each other
|
||||
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
|
||||
// orders with respect to each other
|
||||
DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same
|
||||
// order with respect to each other, but one or more of them have different
|
||||
// indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 }
|
||||
}
|
||||
|
||||
/**
|
||||
* @param validationExclusion exclusions to validation
|
||||
* @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency
|
||||
*/
|
||||
public static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) {
|
||||
private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) {
|
||||
return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY ||
|
||||
validationExclusion == ValidationExclusion.TYPE.ALL );
|
||||
}
|
||||
|
||||
/**
|
||||
* Testings for compatbility between dict1 and dict2. If the dictionaries are incompatible, then UserExceptions are
|
||||
* thrown with detailed error messages. If the engine is in permissive mode, then logger.warnings of generated instead
|
||||
* Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then
|
||||
* UserExceptions are thrown with detailed error messages. If the engine is in permissive mode, then
|
||||
* logger warnings are generated instead.
|
||||
*
|
||||
* @param logger for warnings
|
||||
* @param validationExclusion exclusions to validation
|
||||
|
|
@ -94,9 +109,22 @@ public class SequenceDictionaryUtils {
|
|||
* @param dict1 the sequence dictionary dict1
|
||||
* @param name2 name associated with dict2
|
||||
* @param dict2 the sequence dictionary dict2
|
||||
* @param isReadsToReferenceComparison true if one of the dictionaries comes from a reads data source (eg., a BAM),
|
||||
* and the other from a reference data source
|
||||
* @param intervals the user-specified genomic intervals: only required when isReadsToReferenceComparison is true,
|
||||
* otherwise can be null
|
||||
*/
|
||||
public static void validateDictionaries(Logger logger, ValidationExclusion.TYPE validationExclusion, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) {
|
||||
SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2);
|
||||
public static void validateDictionaries( final Logger logger,
|
||||
final ValidationExclusion.TYPE validationExclusion,
|
||||
final String name1,
|
||||
final SAMSequenceDictionary dict1,
|
||||
final String name2,
|
||||
final SAMSequenceDictionary dict2,
|
||||
final boolean isReadsToReferenceComparison,
|
||||
final GenomeLocSortedSet intervals ) {
|
||||
|
||||
final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2);
|
||||
|
||||
switch ( type ) {
|
||||
case IDENTICAL:
|
||||
return;
|
||||
|
|
@ -134,15 +162,48 @@ public class SequenceDictionaryUtils {
|
|||
logger.warn(ex.getMessage());
|
||||
else
|
||||
throw ex;
|
||||
break;
|
||||
}
|
||||
|
||||
case OUT_OF_ORDER: {
|
||||
UserException ex = new UserException.IncompatibleSequenceDictionaries("Order of contigs differences, which is unsafe", name1, dict1, name2, dict2);
|
||||
UserException ex = new UserException.IncompatibleSequenceDictionaries("Relative ordering of overlapping contigs differs, which is unsafe", name1, dict1, name2, dict2);
|
||||
if ( allowNonFatalIncompabilities(validationExclusion) )
|
||||
logger.warn(ex.getMessage());
|
||||
else
|
||||
throw ex;
|
||||
} break;
|
||||
break;
|
||||
}
|
||||
|
||||
case DIFFERENT_INDICES: {
|
||||
// This is currently only known to be problematic when the index mismatch is between a bam and the
|
||||
// reference AND when the user's intervals actually include one or more of the contigs that are
|
||||
// indexed differently from the reference. In this case, the engine will fail to correctly serve
|
||||
// up the reads from those contigs, so throw an exception unless unsafe operations are enabled.
|
||||
if ( isReadsToReferenceComparison && intervals != null ) {
|
||||
|
||||
final Set<String> misindexedContigs = findMisindexedContigsInIntervals(intervals, dict1, dict2);
|
||||
|
||||
if ( ! misindexedContigs.isEmpty() ) {
|
||||
final String msg = String.format("The following contigs included in the intervals to process have " +
|
||||
"different indices in the sequence dictionaries for the reads vs. " +
|
||||
"the reference: %s. As a result, the GATK engine will not correctly " +
|
||||
"process reads from these contigs. You should either fix the sequence " +
|
||||
"dictionaries for your reads so that these contigs have the same indices " +
|
||||
"as in the sequence dictionary for your reference, or exclude these contigs " +
|
||||
"from your intervals. This error can be disabled via -U %s, " +
|
||||
"however this is not recommended as the GATK engine will not behave correctly.",
|
||||
misindexedContigs, ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY);
|
||||
final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2);
|
||||
|
||||
if ( allowNonFatalIncompabilities(validationExclusion) )
|
||||
logger.warn(ex.getMessage());
|
||||
else
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
throw new ReviewedStingException("Unexpected SequenceDictionaryComparison type: " + type);
|
||||
}
|
||||
|
|
@ -151,32 +212,33 @@ public class SequenceDictionaryUtils {
|
|||
/**
|
||||
* Workhorse routine that takes two dictionaries and returns their compatibility.
|
||||
*
|
||||
* @param dict1
|
||||
* @param dict2
|
||||
* @return
|
||||
* @param dict1 first sequence dictionary
|
||||
* @param dict2 second sequence dictionary
|
||||
* @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries
|
||||
*/
|
||||
public static SequenceDictionaryCompatibility compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
|
||||
if ( nonCanonicalHumanContigOrder( dict1 ) || nonCanonicalHumanContigOrder(dict2) )
|
||||
public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
|
||||
if ( nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2) )
|
||||
return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER;
|
||||
|
||||
Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
||||
final Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
||||
|
||||
if (commonContigs.size() == 0)
|
||||
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
|
||||
else if ( ! commonContigsAreEquivalent( commonContigs, dict1, dict2 ) )
|
||||
else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) )
|
||||
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
|
||||
else if ( ! commonContigsAreInOrder( commonContigs, dict1, dict2 ) )
|
||||
else if ( ! commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2) )
|
||||
return SequenceDictionaryCompatibility.OUT_OF_ORDER;
|
||||
else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() )
|
||||
return SequenceDictionaryCompatibility.IDENTICAL;
|
||||
else if ( ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) )
|
||||
return SequenceDictionaryCompatibility.DIFFERENT_INDICES;
|
||||
else {
|
||||
return SequenceDictionaryCompatibility.COMMON_SUBSET;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalece means
|
||||
* Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means
|
||||
* that the seq records have the same length, if both are non-zero.
|
||||
*
|
||||
* @param commonContigs
|
||||
|
|
@ -184,7 +246,7 @@ public class SequenceDictionaryUtils {
|
|||
* @param dict2
|
||||
* @return true if all of the common contigs are equivalent
|
||||
*/
|
||||
private static boolean commonContigsAreEquivalent(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
private static boolean commonContigsHaveSameLengths(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
return findDisequalCommonContigs(commonContigs, dict1, dict2) == null;
|
||||
}
|
||||
|
||||
|
|
@ -201,7 +263,7 @@ public class SequenceDictionaryUtils {
|
|||
for ( String name : commonContigs ) {
|
||||
SAMSequenceRecord elt1 = dict1.getSequence(name);
|
||||
SAMSequenceRecord elt2 = dict2.getSequence(name);
|
||||
if ( ! SequenceRecordsAreEquivalent(elt1, elt2) )
|
||||
if ( ! sequenceRecordsAreEquivalent(elt1, elt2) )
|
||||
return Arrays.asList(elt1,elt2);
|
||||
}
|
||||
|
||||
|
|
@ -216,12 +278,10 @@ public class SequenceDictionaryUtils {
|
|||
* @param that
|
||||
* @return
|
||||
*/
|
||||
private static boolean SequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) {
|
||||
private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) {
|
||||
if (me == that) return true;
|
||||
if (that == null) return false;
|
||||
|
||||
// I don't care if the indices are difference
|
||||
//if (me.getSequenceIndex() != that.getSequenceIndex()) return false;
|
||||
if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength())
|
||||
return false;
|
||||
|
||||
|
|
@ -280,18 +340,18 @@ public class SequenceDictionaryUtils {
|
|||
return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns true if the common contigs in dict1 and dict2 are in the same order. This is accomplished by getting the
|
||||
* common contigs in both dictionaries, sorting these according to their indices, and the walking through
|
||||
* the sorted list to ensure that each ordered contig is equivalent
|
||||
* Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to
|
||||
* absolute index position. This is accomplished by getting the common contigs in both dictionaries, sorting
|
||||
* these according to their indices, and then walking through the sorted list to ensure that each ordered contig
|
||||
* is equivalent
|
||||
*
|
||||
* @param commonContigs
|
||||
* @param dict1
|
||||
* @param dict2
|
||||
* @return
|
||||
* @param commonContigs names of the contigs common to both dictionaries
|
||||
* @param dict1 first SAMSequenceDictionary
|
||||
* @param dict2 second SAMSequenceDictionary
|
||||
* @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false
|
||||
*/
|
||||
public static boolean commonContigsAreInOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
private static boolean commonContigsAreInSameRelativeOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
List<SAMSequenceRecord> list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1));
|
||||
List<SAMSequenceRecord> list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2));
|
||||
|
||||
|
|
@ -321,10 +381,6 @@ public class SequenceDictionaryUtils {
|
|||
return l;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
// Utilities for comparing the order of sequence records
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Compares sequence records by their order
|
||||
*/
|
||||
|
|
@ -346,6 +402,81 @@ public class SequenceDictionaryUtils {
|
|||
return unsorted;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether the common contigs in the given sequence dictionaries occur at the same indices
|
||||
* in both dictionaries
|
||||
*
|
||||
* @param commonContigs Set of names of the contigs that occur in both dictionaries
|
||||
* @param dict1 first sequence dictionary
|
||||
* @param dict2 second sequence dictionary
|
||||
* @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries,
|
||||
* otherwise false
|
||||
*/
|
||||
private static boolean commonContigsAreAtSameIndices( final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) {
|
||||
for ( String commonContig : commonContigs ) {
|
||||
SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
|
||||
SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);
|
||||
|
||||
// Each common contig must have the same index in both dictionaries
|
||||
if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the set of names of the contigs found in both sequence dictionaries that have different indices
|
||||
* in the two dictionaries.
|
||||
*
|
||||
* @param commonContigs Set of names of the contigs common to both dictionaries
|
||||
* @param dict1 first sequence dictionary
|
||||
* @param dict2 second sequence dictionary
|
||||
* @return a Set containing the names of the common contigs indexed differently in dict1 vs. dict2,
|
||||
* or an empty Set if there are no such contigs
|
||||
*/
|
||||
private static Set<String> getDifferentlyIndexedCommonContigs( final Set<String> commonContigs,
|
||||
final SAMSequenceDictionary dict1,
|
||||
final SAMSequenceDictionary dict2 ) {
|
||||
|
||||
final Set<String> differentlyIndexedCommonContigs = new LinkedHashSet<String>(Utils.optimumHashSize(commonContigs.size()));
|
||||
|
||||
for ( String commonContig : commonContigs ) {
|
||||
if ( dict1.getSequence(commonContig).getSequenceIndex() != dict2.getSequence(commonContig).getSequenceIndex() ) {
|
||||
differentlyIndexedCommonContigs.add(commonContig);
|
||||
}
|
||||
}
|
||||
|
||||
return differentlyIndexedCommonContigs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the names of any contigs indexed differently in the two sequence dictionaries that also
|
||||
* occur in the provided set of intervals.
|
||||
*
|
||||
* @param intervals GenomeLocSortedSet containing the intervals to check
|
||||
* @param dict1 first sequence dictionary
|
||||
* @param dict2 second sequence dictionary
|
||||
* @return a Set of the names of the contigs indexed differently in dict1 vs dict2 that also
|
||||
* occur in the provided intervals, or an empty Set if there are no such contigs
|
||||
*/
|
||||
private static Set<String> findMisindexedContigsInIntervals( final GenomeLocSortedSet intervals,
|
||||
final SAMSequenceDictionary dict1,
|
||||
final SAMSequenceDictionary dict2 ) {
|
||||
|
||||
final Set<String> differentlyIndexedCommonContigs = getDifferentlyIndexedCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
|
||||
final Set<String> misindexedContigsInIntervals = new LinkedHashSet<String>(Utils.optimumHashSize(differentlyIndexedCommonContigs.size()));
|
||||
|
||||
// We know differentlyIndexedCommonContigs is a HashSet, so this loop is O(intervals)
|
||||
for ( GenomeLoc interval : intervals ) {
|
||||
if ( differentlyIndexedCommonContigs.contains(interval.getContig()) ) {
|
||||
misindexedContigsInIntervals.add(interval.getContig());
|
||||
}
|
||||
}
|
||||
|
||||
return misindexedContigsInIntervals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the set of contig names found in both dicts.
|
||||
|
|
@ -360,9 +491,37 @@ public class SequenceDictionaryUtils {
|
|||
}
|
||||
|
||||
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
|
||||
Set<String> contigNames = new HashSet<String>((int)(dict.size() / 0.75f) + 1, 0.75f);
|
||||
Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size()));
|
||||
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
|
||||
contigNames.add(dictionaryEntry.getSequenceName());
|
||||
return contigNames;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a compact String representation of the sequence dictionary it's passed
|
||||
*
|
||||
* The format of the returned String is:
|
||||
* [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... ]
|
||||
*
|
||||
* @param dict a non-null SAMSequenceDictionary
|
||||
* @return A String containing all of the contig names and lengths from the sequence dictionary it's passed
|
||||
*/
|
||||
public static String getDictionaryAsString( final SAMSequenceDictionary dict ) {
|
||||
if ( dict == null ) {
|
||||
throw new IllegalArgumentException("Sequence dictionary must be non-null");
|
||||
}
|
||||
|
||||
StringBuilder s = new StringBuilder("[ ");
|
||||
|
||||
for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
|
||||
s.append(dictionaryEntry.getSequenceName());
|
||||
s.append("(length:");
|
||||
s.append(dictionaryEntry.getSequenceLength());
|
||||
s.append(") ");
|
||||
}
|
||||
|
||||
s.append("]");
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,22 +64,18 @@ public class DictionaryConsistencyIntegrationTest extends WalkerTest {
|
|||
|
||||
}
|
||||
|
||||
@Test public void failBAM1() { executeTest("b36bam-v-b37", testBAM(b37KGReference, b36BAM)); }
|
||||
@Test public void failBAM2() { executeTest("b36bam-v-hg18", testBAM(hg18Reference, b36BAM)); }
|
||||
@Test public void failBAM3() { executeTest("b36bam-v-hg19", testBAM(hg19Reference, b36BAM)); }
|
||||
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||
@Test public void failBAM1() { executeTest("b36bam-v-b37", testBAM(b37KGReference, b36BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||
@Test public void failBAM2() { executeTest("b36bam-v-hg18", testBAM(hg18Reference, b36BAM, "chr1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||
@Test public void failBAM3() { executeTest("b36bam-v-hg19", testBAM(hg19Reference, b36BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM, "chr1", UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||
|
||||
@Test public void failBAM5() { executeTest("hg18bam-v-b36", testBAM(b36KGReference, hg18BAM)); }
|
||||
@Test public void failBAM6() { executeTest("hg18bam-v-b37", testBAM(b37KGReference, hg18BAM)); }
|
||||
@Test public void failBAM7() { executeTest("hg18bam-v-hg19", testBAM(hg19Reference, hg18BAM)); }
|
||||
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||
@Test public void failBAM5() { executeTest("hg18bam-v-b36", testBAM(b36KGReference, hg18BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||
@Test public void failBAM6() { executeTest("hg18bam-v-b37", testBAM(b37KGReference, hg18BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||
@Test public void failBAM7() { executeTest("hg18bam-v-hg19", testBAM(hg19Reference, hg18BAM, "1", UserException.IncompatibleSequenceDictionaries.class)); }
|
||||
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM, "chr1", UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||
|
||||
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam) {
|
||||
return testBAM(ref, bam, UserException.IncompatibleSequenceDictionaries.class);
|
||||
}
|
||||
|
||||
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam, Class c) {
|
||||
return new WalkerTest.WalkerTestSpec("-T UnifiedGenotyper -I " + bam + " -R " + ref + " -L 1:10,000,000-11,000,000 -o %s",
|
||||
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam, String contig, Class c) {
|
||||
return new WalkerTest.WalkerTestSpec("-T UnifiedGenotyper -I " + bam + " -R " + ref + " -L " + contig + ":10,000,000-11,000,000 -o %s",
|
||||
1, c);
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,241 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.Assert;
|
||||
|
||||
import static org.broadinstitute.sting.utils.SequenceDictionaryUtils.*;
|
||||
import static org.broadinstitute.sting.utils.SequenceDictionaryUtils.SequenceDictionaryCompatibility.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class SequenceDictionaryUtilsUnitTest extends BaseTest {
|
||||
|
||||
private static Logger logger = Logger.getLogger(SequenceDictionaryUtilsUnitTest.class);
|
||||
|
||||
|
||||
@DataProvider( name = "SequenceDictionaryDataProvider" )
|
||||
public Object[][] generateSequenceDictionaryTestData() {
|
||||
final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571);
|
||||
final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309);
|
||||
final SAMSequenceRecord CHR_NONSTANDARD2 = new SAMSequenceRecord("NonStandard2", 8675308);
|
||||
|
||||
final Class NO_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class;
|
||||
final Class UNEQUAL_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class;
|
||||
final Class NON_CANONICAL_HUMAN_ORDER_EXCEPTION = UserException.LexicographicallySortedSequenceDictionary.class;
|
||||
final Class OUT_OF_ORDER_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class;
|
||||
final Class DIFFERENT_INDICES_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class;
|
||||
|
||||
final List<SAMSequenceRecord> hg19Sequences = Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR10_HG19);
|
||||
final GenomeLocParser hg19GenomeLocParser = new GenomeLocParser(new SAMSequenceDictionary(hg19Sequences));
|
||||
final List<GenomeLoc> hg19AllContigsIntervals = Arrays.asList(hg19GenomeLocParser.createGenomeLoc("chrM", 0, 1),
|
||||
hg19GenomeLocParser.createGenomeLoc("chr1", 0, 1),
|
||||
hg19GenomeLocParser.createGenomeLoc("chr2", 0, 1),
|
||||
hg19GenomeLocParser.createGenomeLoc("chr10", 0, 1));
|
||||
final List<GenomeLoc> hg19PartialContigsIntervals = Arrays.asList(hg19GenomeLocParser.createGenomeLoc("chrM", 0, 1),
|
||||
hg19GenomeLocParser.createGenomeLoc("chr1", 0, 1));
|
||||
final GenomeLocSortedSet hg19AllContigsIntervalSet = new GenomeLocSortedSet(hg19GenomeLocParser, hg19AllContigsIntervals);
|
||||
final GenomeLocSortedSet hg19PartialContigsIntervalSet = new GenomeLocSortedSet(hg19GenomeLocParser, hg19PartialContigsIntervals);
|
||||
|
||||
return new Object[][] {
|
||||
// Identical dictionaries:
|
||||
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), null, IDENTICAL, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, IDENTICAL, null, false, null },
|
||||
{ Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), null, IDENTICAL, null, false, null },
|
||||
{ Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, IDENTICAL, null, false, null },
|
||||
|
||||
// Dictionaries with a common subset:
|
||||
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null },
|
||||
|
||||
// Dictionaries with no common contigs:
|
||||
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
|
||||
// Dictionaries with unequal common contigs:
|
||||
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
|
||||
// One or both dictionaries in non-canonical human order:
|
||||
{ Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null },
|
||||
|
||||
// Dictionaries with a common subset, but different relative ordering within that subset:
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null },
|
||||
|
||||
|
||||
// Dictionaries with a common subset in the same relative order, but with different indices.
|
||||
// This will only throw an exception during validation if isReadsToReferenceComparison is true,
|
||||
// and there are intervals overlapping the misindexed contigs:
|
||||
|
||||
// These have isReadsToReferenceComparison == true and overlapping intervals, so we expect an exception:
|
||||
{ Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet },
|
||||
|
||||
// These have isReadsToReferenceComparison == true but no overlapping intervals, so we don't expect an exception:
|
||||
{ Arrays.asList(CHR2_HG19, CHR10_HG19), Arrays.asList(CHR10_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet },
|
||||
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet },
|
||||
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet },
|
||||
|
||||
// These have isReadsToReferenceComparison == false, so we don't expect an exception:
|
||||
{ Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), null, DIFFERENT_INDICES, null, false, hg19AllContigsIntervalSet },
|
||||
{ Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), null, DIFFERENT_INDICES, null, false, hg19AllContigsIntervalSet },
|
||||
|
||||
|
||||
// Tests for validation exclusions. Note that errors resulting from NO_COMMON_CONTIGs cannot be suppressed
|
||||
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), ValidationExclusion.TYPE.ALL, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null },
|
||||
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, UNEQUAL_COMMON_CONTIGS, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), ValidationExclusion.TYPE.ALL, UNEQUAL_COMMON_CONTIGS, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, NON_CANONICAL_HUMAN_ORDER, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), ValidationExclusion.TYPE.ALL, NON_CANONICAL_HUMAN_ORDER, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, OUT_OF_ORDER, null, false, null },
|
||||
{ Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), ValidationExclusion.TYPE.ALL, OUT_OF_ORDER, null, false, null },
|
||||
{ Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, DIFFERENT_INDICES, null, true, hg19AllContigsIntervalSet },
|
||||
{ Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), ValidationExclusion.TYPE.ALL, DIFFERENT_INDICES, null, true, hg19AllContigsIntervalSet }
|
||||
};
|
||||
}
|
||||
|
||||
@Test( dataProvider = "SequenceDictionaryDataProvider" )
|
||||
public void testSequenceDictionaryValidation( final List<SAMSequenceRecord> firstDictionaryContigs,
|
||||
final List<SAMSequenceRecord> secondDictionaryContigs,
|
||||
final ValidationExclusion.TYPE validationExclusions,
|
||||
final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility,
|
||||
final Class expectedExceptionUponValidation,
|
||||
final boolean isReadsToReferenceComparison,
|
||||
final GenomeLocSortedSet intervals ) {
|
||||
|
||||
final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs);
|
||||
final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs);
|
||||
final String testDescription = String.format("First dictionary: %s Second dictionary: %s Validation exclusions: %s",
|
||||
SequenceDictionaryUtils.getDictionaryAsString(firstDictionary),
|
||||
SequenceDictionaryUtils.getDictionaryAsString(secondDictionary),
|
||||
validationExclusions);
|
||||
|
||||
Exception exceptionThrown = null;
|
||||
try {
|
||||
SequenceDictionaryUtils.validateDictionaries(logger,
|
||||
validationExclusions,
|
||||
"firstDictionary",
|
||||
firstDictionary,
|
||||
"secondDictionary",
|
||||
secondDictionary,
|
||||
isReadsToReferenceComparison,
|
||||
intervals);
|
||||
}
|
||||
catch ( Exception e ) {
|
||||
exceptionThrown = e;
|
||||
}
|
||||
|
||||
if ( expectedExceptionUponValidation != null ) {
|
||||
Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown),
|
||||
String.format("Expected exception %s but saw %s instead. %s",
|
||||
expectedExceptionUponValidation.getSimpleName(),
|
||||
exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(),
|
||||
testDescription));
|
||||
}
|
||||
else {
|
||||
Assert.assertTrue(exceptionThrown == null,
|
||||
String.format("Expected no exception but saw exception %s instead. %s",
|
||||
exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none",
|
||||
testDescription));
|
||||
}
|
||||
}
|
||||
|
||||
@Test( dataProvider = "SequenceDictionaryDataProvider" )
|
||||
public void testSequenceDictionaryComparison( final List<SAMSequenceRecord> firstDictionaryContigs,
|
||||
final List<SAMSequenceRecord> secondDictionaryContigs,
|
||||
final ValidationExclusion.TYPE validationExclusions,
|
||||
final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility,
|
||||
final Class expectedExceptionUponValidation,
|
||||
final boolean isReadsToReferenceComparison,
|
||||
final GenomeLocSortedSet intervals ) {
|
||||
|
||||
final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs);
|
||||
final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs);
|
||||
final String testDescription = String.format("First dictionary: %s Second dictionary: %s",
|
||||
SequenceDictionaryUtils.getDictionaryAsString(firstDictionary),
|
||||
SequenceDictionaryUtils.getDictionaryAsString(secondDictionary));
|
||||
|
||||
final SequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility =
|
||||
SequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary);
|
||||
|
||||
Assert.assertTrue(reportedCompatibility == dictionaryCompatibility,
|
||||
String.format("Dictionary comparison should have returned %s but instead returned %s. %s",
|
||||
dictionaryCompatibility, reportedCompatibility, testDescription));
|
||||
}
|
||||
|
||||
private SAMSequenceDictionary createSequenceDictionary( final List<SAMSequenceRecord> contigs ) {
|
||||
final List<SAMSequenceRecord> clonedContigs = new ArrayList<SAMSequenceRecord>(contigs.size());
|
||||
|
||||
// Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects
|
||||
// across multiple dictionaries in tests
|
||||
for ( SAMSequenceRecord contig : contigs ) {
|
||||
clonedContigs.add(contig.clone());
|
||||
}
|
||||
|
||||
return new SAMSequenceDictionary(clonedContigs);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue