GATK now detects and UserExceptions when human lexicographically sorted data is provided
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4343 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
67bcf3a7e4
commit
745b8cc6d3
|
|
@ -50,6 +50,22 @@ import java.util.*;
|
||||||
* is enabled before it blows up.
|
* is enabled before it blows up.
|
||||||
*/
|
*/
|
||||||
public class SequenceDictionaryUtils {
|
public class SequenceDictionaryUtils {
|
||||||
|
//
|
||||||
|
// for detecting lexicographically sorted human references
|
||||||
|
//
|
||||||
|
|
||||||
|
private static boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true;
|
||||||
|
|
||||||
|
// hg18
|
||||||
|
private static SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719);
|
||||||
|
private static SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149);
|
||||||
|
private static SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737);
|
||||||
|
|
||||||
|
// hg19
|
||||||
|
private static SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621);
|
||||||
|
private static SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373);
|
||||||
|
private static SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747);
|
||||||
|
|
||||||
public enum SequenceDictionaryCompatability {
|
public enum SequenceDictionaryCompatability {
|
||||||
IDENTICAL, // the dictionaries are identical
|
IDENTICAL, // the dictionaries are identical
|
||||||
COMMON_SUBSET, // there exists a common subset of equivalent contigs
|
COMMON_SUBSET, // there exists a common subset of equivalent contigs
|
||||||
|
|
@ -87,6 +103,7 @@ public class SequenceDictionaryUtils {
|
||||||
return;
|
return;
|
||||||
case NO_COMMON_CONTIGS:
|
case NO_COMMON_CONTIGS:
|
||||||
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
|
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
|
||||||
|
|
||||||
case UNEQUAL_COMMON_CONTIGS: {
|
case UNEQUAL_COMMON_CONTIGS: {
|
||||||
List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
|
List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
|
||||||
SAMSequenceRecord elt1 = x.get(0);
|
SAMSequenceRecord elt1 = x.get(0);
|
||||||
|
|
@ -106,8 +123,11 @@ public class SequenceDictionaryUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
case NON_CANONICAL_HUMAN_ORDER: {
|
case NON_CANONICAL_HUMAN_ORDER: {
|
||||||
UserException ex = new UserException.IncompatibleSequenceDictionaries("Human genome sequence provided in non-canonical ordering. For safety's sake the GATK requires contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs",
|
UserException ex;
|
||||||
name1, dict1, name2, dict2);
|
if ( nonCanonicalHumanContigOrder(dict1) )
|
||||||
|
ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1);
|
||||||
|
else
|
||||||
|
ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2);
|
||||||
|
|
||||||
if ( allowNonFatalIncompabilities() )
|
if ( allowNonFatalIncompabilities() )
|
||||||
logger.warn(ex.getMessage());
|
logger.warn(ex.getMessage());
|
||||||
|
|
@ -136,14 +156,15 @@ public class SequenceDictionaryUtils {
|
||||||
*/
|
*/
|
||||||
public static SequenceDictionaryCompatability compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
public static SequenceDictionaryCompatability compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||||
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
|
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
|
||||||
|
if ( nonCanonicalHumanContigOrder( dict1 ) || nonCanonicalHumanContigOrder(dict2) )
|
||||||
|
return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER;
|
||||||
|
|
||||||
Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
|
||||||
|
|
||||||
if (commonContigs.size() == 0)
|
if (commonContigs.size() == 0)
|
||||||
return SequenceDictionaryCompatability.NO_COMMON_CONTIGS;
|
return SequenceDictionaryCompatability.NO_COMMON_CONTIGS;
|
||||||
else if ( ! commonContigsAreEquivalent( commonContigs, dict1, dict2 ) )
|
else if ( ! commonContigsAreEquivalent( commonContigs, dict1, dict2 ) )
|
||||||
return SequenceDictionaryCompatability.UNEQUAL_COMMON_CONTIGS;
|
return SequenceDictionaryCompatability.UNEQUAL_COMMON_CONTIGS;
|
||||||
else if ( nonCanonicalHumanContigOrder( commonContigs, dict1, dict2 ) )
|
|
||||||
return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER;
|
|
||||||
else if ( ! commonContigsAreInOrder( commonContigs, dict1, dict2 ) )
|
else if ( ! commonContigsAreInOrder( commonContigs, dict1, dict2 ) )
|
||||||
return SequenceDictionaryCompatability.OUT_OF_ORDER;
|
return SequenceDictionaryCompatability.OUT_OF_ORDER;
|
||||||
else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() )
|
else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() )
|
||||||
|
|
@ -220,18 +241,44 @@ public class SequenceDictionaryUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Placeholder for function that determines if the dicts come from the human genome that's been sorted in a
|
* A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18/hg19) and if it's
|
||||||
* non-canonical order. Returns just returns false (function not enabled).
|
* lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these
|
||||||
|
* are all matched, requiring that the order be chr1, chr2, chr10.
|
||||||
*
|
*
|
||||||
* @param commonContigs
|
* @param dict
|
||||||
* @param dict1
|
|
||||||
* @param dict2
|
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private static boolean nonCanonicalHumanContigOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) {
|
||||||
// todo -- implement me if we decide to detect this case
|
if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false
|
||||||
|
return false;
|
||||||
|
|
||||||
|
SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;
|
||||||
|
|
||||||
|
for ( SAMSequenceRecord elt : dict.getSequences() ) {
|
||||||
|
if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt;
|
||||||
|
if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt;
|
||||||
|
if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( chr1 != null && chr2 != null && chr10 != null) {
|
||||||
|
// we found them all
|
||||||
|
return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() );
|
||||||
|
} else {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Trivial helper that returns true if elt has the same length as rec1 or rec2
|
||||||
|
* @param elt record to test
|
||||||
|
* @param rec1 first record to test for length equivalence
|
||||||
|
* @param rec2 first record to test for length equivalence
|
||||||
|
* @return true if elt has the same length as either rec1 or rec2
|
||||||
|
*/
|
||||||
|
private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord rec1, SAMSequenceRecord rec2 ) {
|
||||||
|
return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true if the common contigs in dict1 and dict2 are in the same order. This is accomplished by getting the
|
* Returns true if the common contigs in dict1 and dict2 are in the same order. This is accomplished by getting the
|
||||||
|
|
|
||||||
|
|
@ -187,6 +187,17 @@ public class UserException extends ReviewedStingException {
|
||||||
super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s",
|
super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s",
|
||||||
name1, name2, message, name1, prettyPrintSequenceRecords(dict1), name2, prettyPrintSequenceRecords(dict2)));
|
name1, name2, message, name1, prettyPrintSequenceRecords(dict1), name2, prettyPrintSequenceRecords(dict2)));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class LexicographicallySortedSequenceDictionary extends UserException {
|
||||||
|
public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) {
|
||||||
|
super(String.format("Lexicographically sorted human genome sequence detected in %s."
|
||||||
|
+ "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs."
|
||||||
|
+ "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files"
|
||||||
|
+ "\n %s contigs = %s",
|
||||||
|
name, name, prettyPrintSequenceRecords(dict)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) {
|
private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) {
|
||||||
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
||||||
|
|
@ -195,7 +206,6 @@ public class UserException extends ReviewedStingException {
|
||||||
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
|
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
|
||||||
return Arrays.deepToString(sequenceRecordNames);
|
return Arrays.deepToString(sequenceRecordNames);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
public static class MissingWalker extends UserException {
|
public static class MissingWalker extends UserException {
|
||||||
public MissingWalker(String walkerName, String message) {
|
public MissingWalker(String walkerName, String message) {
|
||||||
|
|
|
||||||
|
|
@ -43,35 +43,43 @@ public class DictionaryConsistencyIntegrationTest extends WalkerTest {
|
||||||
@Test public void fail1() { executeTest("b36xhg18", testVCF(b36KGReference, callsHG18)); }
|
@Test public void fail1() { executeTest("b36xhg18", testVCF(b36KGReference, callsHG18)); }
|
||||||
@Test public void fail2() { executeTest("b37xhg18", testVCF(b37KGReference, callsHG18)); }
|
@Test public void fail2() { executeTest("b37xhg18", testVCF(b37KGReference, callsHG18)); }
|
||||||
@Test public void fail3() { executeTest("hg19xhg18", testVCF(hg19Reference, callsHG18)); }
|
@Test public void fail3() { executeTest("hg19xhg18", testVCF(hg19Reference, callsHG18)); }
|
||||||
@Test public void fail4() { executeTest("hg18lex-v-hg18", testVCF(lexHG18, callsHG18)); }
|
@Test public void fail4() { executeTest("hg18lex-v-hg18", testVCF(lexHG18, callsHG18, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||||
|
|
||||||
// testing b36 calls
|
// testing b36 calls
|
||||||
@Test public void fail5() { executeTest("b36xb36", testVCF(hg18Reference, callsB36)); }
|
@Test public void fail5() { executeTest("b36xb36", testVCF(hg18Reference, callsB36)); }
|
||||||
@Test public void fail6() { executeTest("b37xb36", testVCF(b37KGReference, callsB36)); }
|
@Test public void fail6() { executeTest("b37xb36", testVCF(b37KGReference, callsB36)); }
|
||||||
@Test public void fail7() { executeTest("hg19xb36", testVCF(hg19Reference, callsB36)); }
|
@Test public void fail7() { executeTest("hg19xb36", testVCF(hg19Reference, callsB36)); }
|
||||||
@Test public void fail8() { executeTest("hg18lex-v-b36", testVCF(lexHG18, callsB36)); }
|
@Test public void fail8() { executeTest("hg18lex-v-b36", testVCF(lexHG18, callsB36, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||||
|
|
||||||
private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf) {
|
private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf) {
|
||||||
|
return testVCF(ref, vcf, UserException.IncompatibleSequenceDictionaries.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf, Class c) {
|
||||||
return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 -B:two,vcf "
|
return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 -B:two,vcf "
|
||||||
+ vcf + " -F POS,CHROM -R "
|
+ vcf + " -F POS,CHROM -R "
|
||||||
+ ref + " -o %s",
|
+ ref + " -o %s",
|
||||||
1, UserException.IncompatibleSequenceDictionaries.class);
|
1, c);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test public void failBAM1() { executeTest("b36bam-v-b37", testBAM(b37KGReference, b36BAM)); }
|
@Test public void failBAM1() { executeTest("b36bam-v-b37", testBAM(b37KGReference, b36BAM)); }
|
||||||
@Test public void failBAM2() { executeTest("b36bam-v-hg18", testBAM(hg18Reference, b36BAM)); }
|
@Test public void failBAM2() { executeTest("b36bam-v-hg18", testBAM(hg18Reference, b36BAM)); }
|
||||||
@Test public void failBAM3() { executeTest("b36bam-v-hg19", testBAM(hg19Reference, b36BAM)); }
|
@Test public void failBAM3() { executeTest("b36bam-v-hg19", testBAM(hg19Reference, b36BAM)); }
|
||||||
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM)); }
|
@Test public void failBAM4() { executeTest("b36bam-v-lexhg18", testBAM(lexHG18, b36BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||||
|
|
||||||
@Test public void failBAM5() { executeTest("hg18bam-v-b36", testBAM(b36KGReference, hg18BAM)); }
|
@Test public void failBAM5() { executeTest("hg18bam-v-b36", testBAM(b36KGReference, hg18BAM)); }
|
||||||
@Test public void failBAM6() { executeTest("hg18bam-v-b37", testBAM(b37KGReference, hg18BAM)); }
|
@Test public void failBAM6() { executeTest("hg18bam-v-b37", testBAM(b37KGReference, hg18BAM)); }
|
||||||
@Test public void failBAM7() { executeTest("hg18bam-v-hg19", testBAM(hg19Reference, hg18BAM)); }
|
@Test public void failBAM7() { executeTest("hg18bam-v-hg19", testBAM(hg19Reference, hg18BAM)); }
|
||||||
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM)); }
|
@Test public void failBAM8() { executeTest("hg18bam-v-lexhg18", testBAM(lexHG18, hg18BAM, UserException.LexicographicallySortedSequenceDictionary.class)); }
|
||||||
|
|
||||||
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam) {
|
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam) {
|
||||||
|
return testBAM(ref, bam, UserException.IncompatibleSequenceDictionaries.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
private WalkerTest.WalkerTestSpec testBAM(String ref, String bam, Class c) {
|
||||||
return new WalkerTest.WalkerTestSpec("-T UnifiedGenotyper -I " + bam + " -R " + ref + " -L 1:10,000,000-11,000,000 -o %s",
|
return new WalkerTest.WalkerTestSpec("-T UnifiedGenotyper -I " + bam + " -R " + ref + " -L 1:10,000,000-11,000,000 -o %s",
|
||||||
1, UserException.IncompatibleSequenceDictionaries.class);
|
1, c);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue