GATK now detects and UserExceptions when human lexicographically sorted data is provided
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4343 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
67bcf3a7e4
commit
745b8cc6d3
|
|
@ -50,6 +50,22 @@ import java.util.*;
|
|||
* is enabled before it blows up.
|
||||
*/
|
||||
public class SequenceDictionaryUtils {
|
||||
//
|
||||
// for detecting lexicographically sorted human references
|
||||
//
|
||||
|
||||
private static boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true;
|
||||
|
||||
// hg18
|
||||
private static SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719);
|
||||
private static SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149);
|
||||
private static SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737);
|
||||
|
||||
// hg19
|
||||
private static SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621);
|
||||
private static SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373);
|
||||
private static SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747);
|
||||
|
||||
public enum SequenceDictionaryCompatability {
|
||||
IDENTICAL, // the dictionaries are identical
|
||||
COMMON_SUBSET, // there exists a common subset of equivalent contigs
|
||||
|
|
@ -87,6 +103,7 @@ public class SequenceDictionaryUtils {
|
|||
return;
|
||||
case NO_COMMON_CONTIGS:
|
||||
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
|
||||
|
||||
case UNEQUAL_COMMON_CONTIGS: {
|
||||
List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
|
||||
SAMSequenceRecord elt1 = x.get(0);
|
||||
|
|
@ -106,9 +123,12 @@ public class SequenceDictionaryUtils {
|
|||
}
|
||||
|
||||
case NON_CANONICAL_HUMAN_ORDER: {
|
||||
UserException ex = new UserException.IncompatibleSequenceDictionaries("Human genome sequence provided in non-canonical ordering. For safety's sake the GATK requires contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs",
|
||||
name1, dict1, name2, dict2);
|
||||
|
||||
UserException ex;
|
||||
if ( nonCanonicalHumanContigOrder(dict1) )
|
||||
ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1);
|
||||
else
|
||||
ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2);
|
||||
|
||||
if ( allowNonFatalIncompabilities() )
|
||||
logger.warn(ex.getMessage());
|
||||
else
|
||||
|
|
@ -136,14 +156,15 @@ public class SequenceDictionaryUtils {
|
|||
*/
|
||||
public static SequenceDictionaryCompatability compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
|
||||
if ( nonCanonicalHumanContigOrder( dict1 ) || nonCanonicalHumanContigOrder(dict2) )
|
||||
return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER;
|
||||
|
||||
Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
||||
|
||||
if (commonContigs.size() == 0)
|
||||
return SequenceDictionaryCompatability.NO_COMMON_CONTIGS;
|
||||
else if ( ! commonContigsAreEquivalent( commonContigs, dict1, dict2 ) )
|
||||
return SequenceDictionaryCompatability.UNEQUAL_COMMON_CONTIGS;
|
||||
else if ( nonCanonicalHumanContigOrder( commonContigs, dict1, dict2 ) )
|
||||
return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER;
|
||||
else if ( ! commonContigsAreInOrder( commonContigs, dict1, dict2 ) )
|
||||
return SequenceDictionaryCompatability.OUT_OF_ORDER;
|
||||
else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() )
|
||||
|
|
@ -220,19 +241,45 @@ public class SequenceDictionaryUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Placeholder for function that determines if the dicts come from the human genome that's been sorted in a
|
||||
* non-canonical order. Returns just returns false (function not enabled).
|
||||
* A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18/hg19) and if it's
|
||||
* lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these
|
||||
* are all matched, requiring that the order be chr1, chr2, chr10.
|
||||
*
|
||||
* @param commonContigs
|
||||
* @param dict1
|
||||
* @param dict2
|
||||
* @param dict
|
||||
* @return
|
||||
*/
|
||||
private static boolean nonCanonicalHumanContigOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
// todo -- implement me if we decide to detect this case
|
||||
return false;
|
||||
private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) {
|
||||
if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false
|
||||
return false;
|
||||
|
||||
SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;
|
||||
|
||||
for ( SAMSequenceRecord elt : dict.getSequences() ) {
|
||||
if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt;
|
||||
if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt;
|
||||
if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt;
|
||||
}
|
||||
|
||||
if ( chr1 != null && chr2 != null && chr10 != null) {
|
||||
// we found them all
|
||||
return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() );
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Trivial helper that returns true if elt has the same length as rec1 or rec2
|
||||
* @param elt record to test
|
||||
* @param rec1 first record to test for length equivalence
|
||||
* @param rec2 first record to test for length equivalence
|
||||
* @return true if elt has the same length as either rec1 or rec2
|
||||
*/
|
||||
private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord rec1, SAMSequenceRecord rec2 ) {
|
||||
return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns true if the common contigs in dict1 and dict2 are in the same order. This is accomplished by getting the
|
||||
* common contigs in both dictionaries, sorting these according to their indices, and the walking through
|
||||
|
|
|
|||
|
|
@ -187,16 +187,26 @@ public class UserException extends ReviewedStingException {
|
|||
super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s",
|
||||
name1, name2, message, name1, prettyPrintSequenceRecords(dict1), name2, prettyPrintSequenceRecords(dict2)));
|
||||
}
|
||||
}
|
||||
|
||||
private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) {
|
||||
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
||||
int sequenceRecordIndex = 0;
|
||||
for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences())
|
||||
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
|
||||
return Arrays.deepToString(sequenceRecordNames);
|
||||
public static class LexicographicallySortedSequenceDictionary extends UserException {
|
||||
public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) {
|
||||
super(String.format("Lexicographically sorted human genome sequence detected in %s."
|
||||
+ "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs."
|
||||
+ "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files"
|
||||
+ "\n %s contigs = %s",
|
||||
name, name, prettyPrintSequenceRecords(dict)));
|
||||
}
|
||||
}
|
||||
|
||||
private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) {
|
||||
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
||||
int sequenceRecordIndex = 0;
|
||||
for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences())
|
||||
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
|
||||
return Arrays.deepToString(sequenceRecordNames);
|
||||
}
|
||||
|
||||
public static class MissingWalker extends UserException {
|
||||
public MissingWalker(String walkerName, String message) {
|
||||
super(String.format("Walker %s is not available: %s", walkerName, message));
|
||||
|
|
|
|||
|
|
@ -43,35 +43,43 @@ public class DictionaryConsistencyIntegrationTest extends WalkerTest {
|
|||
@Test public void fail1() { executeTest("b36xhg18", testVCF(b36KGReference, callsHG18)); }
|
||||
@Test public void fail2() { executeTest("b37xhg18", testVCF(b37KGReference, callsHG18)); }
|
||||
@Test public void fail3() { executeTest("hg19xhg18", testVCF(hg19Reference, callsHG18)); }
|
||||
@Test public void fail4() { executeTest("hg18lex-v-hg18", testVCF(lexHG18, callsHG18)); }
|
||||
@Test public void fail4() { executeTest("hg18lex-v-hg18", testVCF(lexHG18, callsHG18, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||
|
||||
// testing b36 calls
|
||||
@Test public void fail5() { executeTest("b36xb36", testVCF(hg18Reference, callsB36)); }
|
||||
@Test public void fail6() { executeTest("b37xb36", testVCF(b37KGReference, callsB36)); }
|
||||
@Test public void fail7() { executeTest("hg19xb36", testVCF(hg19Reference, callsB36)); }
|
||||
@Test public void fail8() { executeTest("hg18lex-v-b36", testVCF(lexHG18, callsB36)); }
|
||||
@Test public void fail8() { executeTest("hg18lex-v-b36", testVCF(lexHG18, callsB36, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||
|
||||
private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf) {
|
||||
return testVCF(ref, vcf, UserException.IncompatibleSequenceDictionaries.class);
|
||||
}
|
||||
|
||||
private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf, Class c) {
|
||||
return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 -B:two,vcf "
|
||||
+ vcf + " -F POS,CHROM -R "
|
||||
+ ref + " -o %s",
|
||||
1, UserException.IncompatibleSequenceDictionaries.class);
|
||||
1, c);
|
||||
|
||||
}
|
||||
|
||||
@Test public void failBAM1() { executeTest("b36bam-v-b37", testBAM(b37KGReference, b36BAM)); }
|
||||
@Test public void failBAM2() { executeTest("b36bam-v-hg18", testBAM(hg18Reference, b36BAM)); }
|
||||
@Test public void failBAM3() { executeTest("b36bam-v-hg19", testBAM(hg19Reference, b36BAM)); }
|
||||
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM)); }
|
||||
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||
|
||||
@Test public void failBAM5() { executeTest("hg18bam-v-b36", testBAM(b36KGReference, hg18BAM)); }
|
||||
@Test public void failBAM6() { executeTest("hg18bam-v-b37", testBAM(b37KGReference, hg18BAM)); }
|
||||
@Test public void failBAM7() { executeTest("hg18bam-v-hg19", testBAM(hg19Reference, hg18BAM)); }
|
||||
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM)); }
|
||||
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||
|
||||
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam) {
|
||||
return testBAM(ref, bam, UserException.IncompatibleSequenceDictionaries.class);
|
||||
}
|
||||
|
||||
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam, Class c) {
|
||||
return new WalkerTest.WalkerTestSpec("-T UnifiedGenotyper -I " + bam + " -R " + ref + " -L 1:10,000,000-11,000,000 -o %s",
|
||||
1, UserException.IncompatibleSequenceDictionaries.class);
|
||||
1, c);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue