diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index ce5507386..029ad0c4e 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -26,6 +26,9 @@ package org.broadinstitute.sting.gatk.executive; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; @@ -41,12 +44,12 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** @@ -114,6 +117,8 @@ public abstract class MicroScheduler { this.reads = getReadsDataSource(reads); this.reference = openReferenceSequenceFile(refFile); this.rods = getReferenceOrderedDataSources(rods); + + validate(this.reads,this.reference); } /** @@ -277,4 +282,62 @@ public abstract class MicroScheduler { GenomeLocParser.setupRefContigOrdering(ref); return ref; } + + /** + * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference. + * TODO: Doing this in the MicroScheduler is a bit late, but this is where data sources are initialized. + * TODO: Move the initialization of data sources back to the GenomeAnalysisEngine. + * @param reads Reads data source. + * @param reference Reference data source. + */ + private void validate( SAMDataSource reads, ReferenceSequenceFile reference ) { + if( reads == null || reference == null ) + return; + + // Compile a set of sequence names that exist in the BAM files. + SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); + + Set readsSequenceNames = new TreeSet(); + for( SAMSequenceRecord dictionaryEntry: readsDictionary.getSequences() ) + readsSequenceNames.add(dictionaryEntry.getSequenceName()); + + // Compile a set of sequence names that exist in the reference file. + SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); + + Set referenceSequenceNames = new TreeSet(); + for( SAMSequenceRecord dictionaryEntry: referenceDictionary.getSequences() ) + referenceSequenceNames.add(dictionaryEntry.getSequenceName()); + + // If there's no overlap between reads and reference, data will be bogus. Throw an exception. + Set intersectingSequenceNames = new HashSet(readsSequenceNames); + intersectingSequenceNames.retainAll(referenceSequenceNames); + if( intersectingSequenceNames.size() == 0 ) { + StringBuilder error = new StringBuilder(); + error.append("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n"); + error.append(System.getProperty("line.separator")); + error.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary))); + error.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary))); + logger.error(error.toString()); + Utils.scareUser("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference."); + } + + // If one data source isn't a strict subset of the other or the match is of fewer than x% of the smaller, + // issue a warning. + if( !readsSequenceNames.equals(referenceSequenceNames) ) { + StringBuilder warning = new StringBuilder(); + warning.append("Limited overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n"); + warning.append(System.getProperty("line.separator")); + warning.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary))); + warning.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary))); + logger.warn(warning.toString()); + } + } + + private String prettyPrintSequenceRecords( SAMSequenceDictionary sequenceDictionary ) { + String[] sequenceRecordNames = new String[ sequenceDictionary.size() ]; + int sequenceRecordIndex = 0; + for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() ) + sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); + return Arrays.deepToString(sequenceRecordNames); + } } diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index a924e4d3c..66817cb2c 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -40,7 +40,7 @@ public class Utils { //System.out.printf("* %s%n", msg); //System.out.printf("********************************************************************************%n"); logger.fatal(msg); - throw new RuntimeException(msg); + throw new StingException(msg); } public static List cons(final T elt, final List l) {