GATK now detects and UserExceptions when human lexicographically sorted data is provided

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4343 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-09-24 15:19:48 +00:00
parent 67bcf3a7e4
commit 745b8cc6d3
3 changed files with 90 additions and 25 deletions

View File

@ -50,6 +50,22 @@ import java.util.*;
* is enabled before it blows up.
*/
public class SequenceDictionaryUtils {
//
// for detecting lexicographically sorted human references
//
private static boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true;
// hg18
private static SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719);
private static SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149);
private static SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737);
// hg19
private static SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621);
private static SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373);
private static SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747);
public enum SequenceDictionaryCompatability {
IDENTICAL, // the dictionaries are identical
COMMON_SUBSET, // there exists a common subset of equivalent contigs
@ -87,6 +103,7 @@ public class SequenceDictionaryUtils {
return;
case NO_COMMON_CONTIGS:
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
case UNEQUAL_COMMON_CONTIGS: {
List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
SAMSequenceRecord elt1 = x.get(0);
@ -106,9 +123,12 @@ public class SequenceDictionaryUtils {
}
case NON_CANONICAL_HUMAN_ORDER: {
UserException ex = new UserException.IncompatibleSequenceDictionaries("Human genome sequence provided in non-canonical ordering. For safety's sake the GATK requires contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs",
name1, dict1, name2, dict2);
UserException ex;
if ( nonCanonicalHumanContigOrder(dict1) )
ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1);
else
ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2);
if ( allowNonFatalIncompabilities() )
logger.warn(ex.getMessage());
else
@ -136,14 +156,15 @@ public class SequenceDictionaryUtils {
*/
public static SequenceDictionaryCompatability compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
if ( nonCanonicalHumanContigOrder( dict1 ) || nonCanonicalHumanContigOrder(dict2) )
return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER;
Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
if (commonContigs.size() == 0)
return SequenceDictionaryCompatability.NO_COMMON_CONTIGS;
else if ( ! commonContigsAreEquivalent( commonContigs, dict1, dict2 ) )
return SequenceDictionaryCompatability.UNEQUAL_COMMON_CONTIGS;
else if ( nonCanonicalHumanContigOrder( commonContigs, dict1, dict2 ) )
return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER;
else if ( ! commonContigsAreInOrder( commonContigs, dict1, dict2 ) )
return SequenceDictionaryCompatability.OUT_OF_ORDER;
else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() )
@ -220,19 +241,45 @@ public class SequenceDictionaryUtils {
}
/**
* Placeholder for function that determines if the dicts come from the human genome that's been sorted in a
* non-canonical order. Returns just returns false (function not enabled).
* A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18/hg19) and if it's
* lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these
* are all matched, requiring that the order be chr1, chr2, chr10.
*
* @param commonContigs
* @param dict1
* @param dict2
* @param dict
* @return
*/
private static boolean nonCanonicalHumanContigOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
// todo -- implement me if we decide to detect this case
return false;
private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) {
if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false
return false;
SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;
for ( SAMSequenceRecord elt : dict.getSequences() ) {
if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt;
if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt;
if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt;
}
if ( chr1 != null && chr2 != null && chr10 != null) {
// we found them all
return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() );
} else {
return false;
}
}
/**
* Trivial helper that returns true if elt has the same length as rec1 or rec2
* @param elt record to test
* @param rec1 first record to test for length equivalence
* @param rec2 first record to test for length equivalence
* @return true if elt has the same length as either rec1 or rec2
*/
private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord rec1, SAMSequenceRecord rec2 ) {
return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength();
}
/**
* Returns true if the common contigs in dict1 and dict2 are in the same order. This is accomplished by getting the
* common contigs in both dictionaries, sorting these according to their indices, and the walking through

View File

@ -187,16 +187,26 @@ public class UserException extends ReviewedStingException {
super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s",
name1, name2, message, name1, prettyPrintSequenceRecords(dict1), name2, prettyPrintSequenceRecords(dict2)));
}
}
private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) {
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
int sequenceRecordIndex = 0;
for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences())
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
return Arrays.deepToString(sequenceRecordNames);
public static class LexicographicallySortedSequenceDictionary extends UserException {
public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) {
super(String.format("Lexicographically sorted human genome sequence detected in %s."
+ "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs."
+ "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files"
+ "\n %s contigs = %s",
name, name, prettyPrintSequenceRecords(dict)));
}
}
private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) {
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
int sequenceRecordIndex = 0;
for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences())
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
return Arrays.deepToString(sequenceRecordNames);
}
public static class MissingWalker extends UserException {
public MissingWalker(String walkerName, String message) {
super(String.format("Walker %s is not available: %s", walkerName, message));

View File

@ -43,35 +43,43 @@ public class DictionaryConsistencyIntegrationTest extends WalkerTest {
@Test public void fail1() { executeTest("b36xhg18", testVCF(b36KGReference, callsHG18)); }
@Test public void fail2() { executeTest("b37xhg18", testVCF(b37KGReference, callsHG18)); }
@Test public void fail3() { executeTest("hg19xhg18", testVCF(hg19Reference, callsHG18)); }
@Test public void fail4() { executeTest("hg18lex-v-hg18", testVCF(lexHG18, callsHG18)); }
@Test public void fail4() { executeTest("hg18lex-v-hg18", testVCF(lexHG18, callsHG18, UserException.LexicographicallySortedSequenceDictionary.class)); }
// testing b36 calls
@Test public void fail5() { executeTest("b36xb36", testVCF(hg18Reference, callsB36)); }
@Test public void fail6() { executeTest("b37xb36", testVCF(b37KGReference, callsB36)); }
@Test public void fail7() { executeTest("hg19xb36", testVCF(hg19Reference, callsB36)); }
@Test public void fail8() { executeTest("hg18lex-v-b36", testVCF(lexHG18, callsB36)); }
@Test public void fail8() { executeTest("hg18lex-v-b36", testVCF(lexHG18, callsB36, UserException.LexicographicallySortedSequenceDictionary.class)); }
private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf) {
return testVCF(ref, vcf, UserException.IncompatibleSequenceDictionaries.class);
}
private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf, Class c) {
return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 -B:two,vcf "
+ vcf + " -F POS,CHROM -R "
+ ref + " -o %s",
1, UserException.IncompatibleSequenceDictionaries.class);
1, c);
}
@Test public void failBAM1() { executeTest("b36bam-v-b37", testBAM(b37KGReference, b36BAM)); }
@Test public void failBAM2() { executeTest("b36bam-v-hg18", testBAM(hg18Reference, b36BAM)); }
@Test public void failBAM3() { executeTest("b36bam-v-hg19", testBAM(hg19Reference, b36BAM)); }
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM)); }
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
@Test public void failBAM5() { executeTest("hg18bam-v-b36", testBAM(b36KGReference, hg18BAM)); }
@Test public void failBAM6() { executeTest("hg18bam-v-b37", testBAM(b37KGReference, hg18BAM)); }
@Test public void failBAM7() { executeTest("hg18bam-v-hg19", testBAM(hg19Reference, hg18BAM)); }
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM)); }
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam) {
return testBAM(ref, bam, UserException.IncompatibleSequenceDictionaries.class);
}
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam, Class c) {
return new WalkerTest.WalkerTestSpec("-T UnifiedGenotyper -I " + bam + " -R " + ref + " -L 1:10,000,000-11,000,000 -o %s",
1, UserException.IncompatibleSequenceDictionaries.class);
1, c);
}
}