/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.*; /** * Created by IntelliJ IDEA. * User: depristo * Date: Sep 10, 2010 * Time: 1:56:24 PM * * A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference, * from BAMs, or from RODs -- for consistency. The system supports two basic modes: get an enum state that * describes at a high level the consistency between two dictionaries, or a validateDictionaries that will * blow up with a UserException if the dicts are too incompatible. * * Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length, * if available. Examines the Engine arguments to decided if the -U option to allow danger seq dict inconsistency * is enabled before it blows up. */ public class SequenceDictionaryUtils { // // for detecting lexicographically sorted human references // private static final boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true; // hg18 protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719); protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149); protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737); // hg19 protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621); protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373); protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747); // b36 protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719); protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149); protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737); // b37 protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621); protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373); protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747); public enum SequenceDictionaryCompatibility { IDENTICAL, // the dictionaries are identical COMMON_SUBSET, // there exists a common subset of equivalent contigs NO_COMMON_CONTIGS, // no overlap between dictionaries UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine) OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different // orders with respect to each other DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same // order with respect to each other, but one or more of them have different // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 } } /** * @param validationExclusion exclusions to validation * @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency */ private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) { return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY || validationExclusion == ValidationExclusion.TYPE.ALL ); } /** * Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then * UserExceptions are thrown with detailed error messages. If the engine is in permissive mode, then * logger warnings are generated instead. * * @param logger for warnings * @param validationExclusion exclusions to validation * @param name1 name associated with dict1 * @param dict1 the sequence dictionary dict1 * @param name2 name associated with dict2 * @param dict2 the sequence dictionary dict2 * @param isReadsToReferenceComparison true if one of the dictionaries comes from a reads data source (eg., a BAM), * and the other from a reference data source * @param intervals the user-specified genomic intervals: only required when isReadsToReferenceComparison is true, * otherwise can be null */ public static void validateDictionaries( final Logger logger, final ValidationExclusion.TYPE validationExclusion, final String name1, final SAMSequenceDictionary dict1, final String name2, final SAMSequenceDictionary dict2, final boolean isReadsToReferenceComparison, final GenomeLocSortedSet intervals ) { final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2); switch ( type ) { case IDENTICAL: return; case COMMON_SUBSET: return; case NO_COMMON_CONTIGS: throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); case UNEQUAL_COMMON_CONTIGS: { List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); SAMSequenceRecord elt1 = x.get(0); SAMSequenceRecord elt2 = x.get(1); // todo -- replace with toString when SAMSequenceRecord has a nice toString routine UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", name1, elt1.getSequenceName(), elt1.getSequenceLength(), name2, elt2.getSequenceName(), elt2.getSequenceLength()), name1, dict1, name2, dict2); if ( allowNonFatalIncompabilities(validationExclusion) ) logger.warn(ex.getMessage()); else throw ex; break; } case NON_CANONICAL_HUMAN_ORDER: { UserException ex; if ( nonCanonicalHumanContigOrder(dict1) ) ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1); else ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2); if ( allowNonFatalIncompabilities(validationExclusion) ) logger.warn(ex.getMessage()); else throw ex; break; } case OUT_OF_ORDER: { UserException ex = new UserException.IncompatibleSequenceDictionaries("Relative ordering of overlapping contigs differs, which is unsafe", name1, dict1, name2, dict2); if ( allowNonFatalIncompabilities(validationExclusion) ) logger.warn(ex.getMessage()); else throw ex; break; } case DIFFERENT_INDICES: { // This is currently only known to be problematic when the index mismatch is between a bam and the // reference AND when the user's intervals actually include one or more of the contigs that are // indexed differently from the reference. In this case, the engine will fail to correctly serve // up the reads from those contigs, so throw an exception unless unsafe operations are enabled. if ( isReadsToReferenceComparison && intervals != null ) { final Set misindexedContigs = findMisindexedContigsInIntervals(intervals, dict1, dict2); if ( ! misindexedContigs.isEmpty() ) { final String msg = String.format("The following contigs included in the intervals to process have " + "different indices in the sequence dictionaries for the reads vs. " + "the reference: %s. As a result, the GATK engine will not correctly " + "process reads from these contigs. You should either fix the sequence " + "dictionaries for your reads so that these contigs have the same indices " + "as in the sequence dictionary for your reference, or exclude these contigs " + "from your intervals. This error can be disabled via -U %s, " + "however this is not recommended as the GATK engine will not behave correctly.", misindexedContigs, ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY); final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); if ( allowNonFatalIncompabilities(validationExclusion) ) logger.warn(ex.getMessage()); else throw ex; } } break; } default: throw new ReviewedStingException("Unexpected SequenceDictionaryComparison type: " + type); } } /** * Workhorse routine that takes two dictionaries and returns their compatibility. * * @param dict1 first sequence dictionary * @param dict2 second sequence dictionary * @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries */ public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) { if ( nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2) ) return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER; final Set commonContigs = getCommonContigsByName(dict1, dict2); if (commonContigs.size() == 0) return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; else if ( ! commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2) ) return SequenceDictionaryCompatibility.OUT_OF_ORDER; else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) return SequenceDictionaryCompatibility.IDENTICAL; else if ( ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) ) return SequenceDictionaryCompatibility.DIFFERENT_INDICES; else { return SequenceDictionaryCompatibility.COMMON_SUBSET; } } /** * Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means * that the seq records have the same length, if both are non-zero. * * @param commonContigs * @param dict1 * @param dict2 * @return true if all of the common contigs are equivalent */ private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; } /** * Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns * null if all common contigs are equivalent * * @param commonContigs * @param dict1 * @param dict2 * @return */ private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { for ( String name : commonContigs ) { SAMSequenceRecord elt1 = dict1.getSequence(name); SAMSequenceRecord elt2 = dict2.getSequence(name); if ( ! sequenceRecordsAreEquivalent(elt1, elt2) ) return Arrays.asList(elt1,elt2); } return null; } /** * Helper routine that returns two sequence records are equivalent, defined as having the same name and * lengths, if both are non-zero * * @param me * @param that * @return */ private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) { if (me == that) return true; if (that == null) return false; if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength()) return false; // todo -- reenable if we want to be really strict here // if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) { // final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); // final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); // if (!thisMd5.equals(thatMd5)) { // return false; // } // } // else { if (me.getSequenceName() != that.getSequenceName()) return false; // Compare using == since we intern() the Strings // } return true; } /** * A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18/hg19) and if it's * lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these * are all matched, requiring that the order be chr1, chr2, chr10. * * @param dict * @return */ private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false return false; SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; for ( SAMSequenceRecord elt : dict.getSequences() ) { if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt; if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt; if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt; } if ( chr1 != null && chr2 != null && chr10 != null) { // we found them all return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() ); } else { return false; } } /** * Trivial helper that returns true if elt has the same length as rec1 or rec2 * @param elt record to test * @param rec1 first record to test for length equivalence * @param rec2 first record to test for length equivalence * @return true if elt has the same length as either rec1 or rec2 */ private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord rec1, SAMSequenceRecord rec2 ) { return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength(); } /** * Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to * absolute index position. This is accomplished by getting the common contigs in both dictionaries, sorting * these according to their indices, and then walking through the sorted list to ensure that each ordered contig * is equivalent * * @param commonContigs names of the contigs common to both dictionaries * @param dict1 first SAMSequenceDictionary * @param dict2 second SAMSequenceDictionary * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false */ private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { List list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1)); List list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2)); for ( int i = 0; i < list1.size(); i++ ) { SAMSequenceRecord elt1 = list1.get(i); SAMSequenceRecord elt2 = list2.get(i); if ( ! elt1.getSequenceName().equals(elt2.getSequenceName()) ) return false; } return true; } /** * Gets the subset of SAMSequenceRecords in commonContigs in dict * * @param commonContigs * @param dict * @return */ private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) { List l = new ArrayList(commonContigs.size()); for ( String name : commonContigs ) { l.add(dict.getSequence(name) ); } return l; } /** * Compares sequence records by their order */ private static class CompareSequenceRecordsByIndex implements Comparator { public int compare(SAMSequenceRecord x, SAMSequenceRecord y) { return Integer.valueOf(x.getSequenceIndex()).compareTo(y.getSequenceIndex()); } } /** * Returns a sorted list of SAMSequenceRecords sorted by their indices. Note that the * list is modified in place, so the returned list is == to the unsorted list. * * @param unsorted * @return */ private static List sortSequenceListByIndex(List unsorted) { Collections.sort(unsorted, new CompareSequenceRecordsByIndex()); return unsorted; } /** * Checks whether the common contigs in the given sequence dictionaries occur at the same indices * in both dictionaries * * @param commonContigs Set of names of the contigs that occur in both dictionaries * @param dict1 first sequence dictionary * @param dict2 second sequence dictionary * @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries, * otherwise false */ private static boolean commonContigsAreAtSameIndices( final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) { for ( String commonContig : commonContigs ) { SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); // Each common contig must have the same index in both dictionaries if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { return false; } } return true; } /** * Gets the set of names of the contigs found in both sequence dictionaries that have different indices * in the two dictionaries. * * @param commonContigs Set of names of the contigs common to both dictionaries * @param dict1 first sequence dictionary * @param dict2 second sequence dictionary * @return a Set containing the names of the common contigs indexed differently in dict1 vs. dict2, * or an empty Set if there are no such contigs */ private static Set getDifferentlyIndexedCommonContigs( final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) { final Set differentlyIndexedCommonContigs = new LinkedHashSet(Utils.optimumHashSize(commonContigs.size())); for ( String commonContig : commonContigs ) { if ( dict1.getSequence(commonContig).getSequenceIndex() != dict2.getSequence(commonContig).getSequenceIndex() ) { differentlyIndexedCommonContigs.add(commonContig); } } return differentlyIndexedCommonContigs; } /** * Finds the names of any contigs indexed differently in the two sequence dictionaries that also * occur in the provided set of intervals. * * @param intervals GenomeLocSortedSet containing the intervals to check * @param dict1 first sequence dictionary * @param dict2 second sequence dictionary * @return a Set of the names of the contigs indexed differently in dict1 vs dict2 that also * occur in the provided intervals, or an empty Set if there are no such contigs */ private static Set findMisindexedContigsInIntervals( final GenomeLocSortedSet intervals, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) { final Set differentlyIndexedCommonContigs = getDifferentlyIndexedCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); final Set misindexedContigsInIntervals = new LinkedHashSet(Utils.optimumHashSize(differentlyIndexedCommonContigs.size())); // We know differentlyIndexedCommonContigs is a HashSet, so this loop is O(intervals) for ( GenomeLoc interval : intervals ) { if ( differentlyIndexedCommonContigs.contains(interval.getContig()) ) { misindexedContigsInIntervals.add(interval.getContig()); } } return misindexedContigsInIntervals; } /** * Returns the set of contig names found in both dicts. * @param dict1 * @param dict2 * @return */ public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { Set intersectingSequenceNames = getContigNames(dict1); intersectingSequenceNames.retainAll(getContigNames(dict2)); return intersectingSequenceNames; } public static Set getContigNames(SAMSequenceDictionary dict) { Set contigNames = new HashSet(Utils.optimumHashSize(dict.size())); for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) contigNames.add(dictionaryEntry.getSequenceName()); return contigNames; } /** * Returns a compact String representation of the sequence dictionary it's passed * * The format of the returned String is: * [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... ] * * @param dict a non-null SAMSequenceDictionary * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed */ public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { if ( dict == null ) { throw new IllegalArgumentException("Sequence dictionary must be non-null"); } StringBuilder s = new StringBuilder("[ "); for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { s.append(dictionaryEntry.getSequenceName()); s.append("(length:"); s.append(dictionaryEntry.getSequenceLength()); s.append(") "); } s.append("]"); return s.toString(); } }