From 745b8cc6d3ec726323df9aaa09f956a3a786f87f Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 24 Sep 2010 15:19:48 +0000 Subject: [PATCH] GATK now detects and UserExceptions when human lexicographically sorted data is provided git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4343 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/SequenceDictionaryUtils.java | 73 +++++++++++++++---- .../sting/utils/exceptions/UserException.java | 22 ++++-- .../DictionaryConsistencyIntegrationTest.java | 20 +++-- 3 files changed, 90 insertions(+), 25 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java b/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java index 6435a0ab0..344a60963 100755 --- a/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java @@ -50,6 +50,22 @@ import java.util.*; * is enabled before it blows up. */ public class SequenceDictionaryUtils { + // + // for detecting lexicographically sorted human references + // + + private static boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true; + + // hg18 + private static SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719); + private static SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149); + private static SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737); + + // hg19 + private static SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621); + private static SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373); + private static SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747); + public enum SequenceDictionaryCompatability { IDENTICAL, // the dictionaries are identical COMMON_SUBSET, // there exists a common subset of equivalent contigs @@ -87,6 +103,7 @@ public class SequenceDictionaryUtils { return; case NO_COMMON_CONTIGS: throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); + case UNEQUAL_COMMON_CONTIGS: { List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); SAMSequenceRecord elt1 = x.get(0); @@ -106,9 +123,12 @@ public class SequenceDictionaryUtils { } case NON_CANONICAL_HUMAN_ORDER: { - UserException ex = new UserException.IncompatibleSequenceDictionaries("Human genome sequence provided in non-canonical ordering. For safety's sake the GATK requires contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs", - name1, dict1, name2, dict2); - + UserException ex; + if ( nonCanonicalHumanContigOrder(dict1) ) + ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1); + else + ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2); + if ( allowNonFatalIncompabilities() ) logger.warn(ex.getMessage()); else @@ -136,14 +156,15 @@ public class SequenceDictionaryUtils { */ public static SequenceDictionaryCompatability compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { // If there's no overlap between reads and reference, data will be bogus. Throw an exception. + if ( nonCanonicalHumanContigOrder( dict1 ) || nonCanonicalHumanContigOrder(dict2) ) + return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER; + Set commonContigs = getCommonContigsByName(dict1, dict2); if (commonContigs.size() == 0) return SequenceDictionaryCompatability.NO_COMMON_CONTIGS; else if ( ! commonContigsAreEquivalent( commonContigs, dict1, dict2 ) ) return SequenceDictionaryCompatability.UNEQUAL_COMMON_CONTIGS; - else if ( nonCanonicalHumanContigOrder( commonContigs, dict1, dict2 ) ) - return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER; else if ( ! commonContigsAreInOrder( commonContigs, dict1, dict2 ) ) return SequenceDictionaryCompatability.OUT_OF_ORDER; else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) @@ -220,19 +241,45 @@ public class SequenceDictionaryUtils { } /** - * Placeholder for function that determines if the dicts come from the human genome that's been sorted in a - * non-canonical order. Returns just returns false (function not enabled). + * A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18/hg19) and if it's + * lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these + * are all matched, requiring that the order be chr1, chr2, chr10. * - * @param commonContigs - * @param dict1 - * @param dict2 + * @param dict * @return */ - private static boolean nonCanonicalHumanContigOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - // todo -- implement me if we decide to detect this case - return false; + private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { + if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false + return false; + + SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; + + for ( SAMSequenceRecord elt : dict.getSequences() ) { + if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt; + if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt; + if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt; + } + + if ( chr1 != null && chr2 != null && chr10 != null) { + // we found them all + return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() ); + } else { + return false; + } } + /** + * Trivial helper that returns true if elt has the same length as rec1 or rec2 + * @param elt record to test + * @param rec1 first record to test for length equivalence + * @param rec2 first record to test for length equivalence + * @return true if elt has the same length as either rec1 or rec2 + */ + private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord rec1, SAMSequenceRecord rec2 ) { + return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength(); + } + + /** * Returns true if the common contigs in dict1 and dict2 are in the same order. This is accomplished by getting the * common contigs in both dictionaries, sorting these according to their indices, and the walking through diff --git a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 0803b4e2c..f9902035d 100755 --- a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -187,16 +187,26 @@ public class UserException extends ReviewedStingException { super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s", name1, name2, message, name1, prettyPrintSequenceRecords(dict1), name2, prettyPrintSequenceRecords(dict2))); } + } - private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) { - String[] sequenceRecordNames = new String[sequenceDictionary.size()]; - int sequenceRecordIndex = 0; - for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) - sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); - return Arrays.deepToString(sequenceRecordNames); + public static class LexicographicallySortedSequenceDictionary extends UserException { + public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) { + super(String.format("Lexicographically sorted human genome sequence detected in %s." + + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." + + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files" + + "\n %s contigs = %s", + name, name, prettyPrintSequenceRecords(dict))); } } + private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) { + String[] sequenceRecordNames = new String[sequenceDictionary.size()]; + int sequenceRecordIndex = 0; + for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) + sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); + return Arrays.deepToString(sequenceRecordNames); + } + public static class MissingWalker extends UserException { public MissingWalker(String walkerName, String message) { super(String.format("Walker %s is not available: %s", walkerName, message)); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java index 478c483f5..4945b09be 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java @@ -43,35 +43,43 @@ public class DictionaryConsistencyIntegrationTest extends WalkerTest { @Test public void fail1() { executeTest("b36xhg18", testVCF(b36KGReference, callsHG18)); } @Test public void fail2() { executeTest("b37xhg18", testVCF(b37KGReference, callsHG18)); } @Test public void fail3() { executeTest("hg19xhg18", testVCF(hg19Reference, callsHG18)); } - @Test public void fail4() { executeTest("hg18lex-v-hg18", testVCF(lexHG18, callsHG18)); } + @Test public void fail4() { executeTest("hg18lex-v-hg18", testVCF(lexHG18, callsHG18, UserException.LexicographicallySortedSequenceDictionary.class)); } // testing b36 calls @Test public void fail5() { executeTest("b36xb36", testVCF(hg18Reference, callsB36)); } @Test public void fail6() { executeTest("b37xb36", testVCF(b37KGReference, callsB36)); } @Test public void fail7() { executeTest("hg19xb36", testVCF(hg19Reference, callsB36)); } - @Test public void fail8() { executeTest("hg18lex-v-b36", testVCF(lexHG18, callsB36)); } + @Test public void fail8() { executeTest("hg18lex-v-b36", testVCF(lexHG18, callsB36, UserException.LexicographicallySortedSequenceDictionary.class)); } private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf) { + return testVCF(ref, vcf, UserException.IncompatibleSequenceDictionaries.class); + } + + private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf, Class c) { return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 -B:two,vcf " + vcf + " -F POS,CHROM -R " + ref + " -o %s", - 1, UserException.IncompatibleSequenceDictionaries.class); + 1, c); } @Test public void failBAM1() { executeTest("b36bam-v-b37", testBAM(b37KGReference, b36BAM)); } @Test public void failBAM2() { executeTest("b36bam-v-hg18", testBAM(hg18Reference, b36BAM)); } @Test public void failBAM3() { executeTest("b36bam-v-hg19", testBAM(hg19Reference, b36BAM)); } - @Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM)); } + @Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM, UserException.LexicographicallySortedSequenceDictionary.class)); } @Test public void failBAM5() { executeTest("hg18bam-v-b36", testBAM(b36KGReference, hg18BAM)); } @Test public void failBAM6() { executeTest("hg18bam-v-b37", testBAM(b37KGReference, hg18BAM)); } @Test public void failBAM7() { executeTest("hg18bam-v-hg19", testBAM(hg19Reference, hg18BAM)); } - @Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM)); } + @Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM, UserException.LexicographicallySortedSequenceDictionary.class)); } private WalkerTest.WalkerTestSpec testBAM(String ref, String bam) { + return testBAM(ref, bam, UserException.IncompatibleSequenceDictionaries.class); + } + + private WalkerTest.WalkerTestSpec testBAM(String ref, String bam, Class c) { return new WalkerTest.WalkerTestSpec("-T UnifiedGenotyper -I " + bam + " -R " + ref + " -L 1:10,000,000-11,000,000 -o %s", - 1, UserException.IncompatibleSequenceDictionaries.class); + 1, c); } }