Fix for GSA-90: System isn't failing with an error when you use the wrong reference.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1225 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
52659d02d4
commit
b18caa2052
|
|
@ -26,6 +26,9 @@
|
|||
package org.broadinstitute.sting.gatk.executive;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
|
|
@ -41,12 +44,12 @@ import org.broadinstitute.sting.gatk.walkers.*;
|
|||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -114,6 +117,8 @@ public abstract class MicroScheduler {
|
|||
this.reads = getReadsDataSource(reads);
|
||||
this.reference = openReferenceSequenceFile(refFile);
|
||||
this.rods = getReferenceOrderedDataSources(rods);
|
||||
|
||||
validate(this.reads,this.reference);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -277,4 +282,62 @@ public abstract class MicroScheduler {
|
|||
GenomeLocParser.setupRefContigOrdering(ref);
|
||||
return ref;
|
||||
}
|
||||
|
||||
/**
|
||||
* Now that all files are open, validate the sequence dictionaries of the reads vs. the reference.
|
||||
* TODO: Doing this in the MicroScheduler is a bit late, but this is where data sources are initialized.
|
||||
* TODO: Move the initialization of data sources back to the GenomeAnalysisEngine.
|
||||
* @param reads Reads data source.
|
||||
* @param reference Reference data source.
|
||||
*/
|
||||
private void validate( SAMDataSource reads, ReferenceSequenceFile reference ) {
|
||||
if( reads == null || reference == null )
|
||||
return;
|
||||
|
||||
// Compile a set of sequence names that exist in the BAM files.
|
||||
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
|
||||
|
||||
Set<String> readsSequenceNames = new TreeSet<String>();
|
||||
for( SAMSequenceRecord dictionaryEntry: readsDictionary.getSequences() )
|
||||
readsSequenceNames.add(dictionaryEntry.getSequenceName());
|
||||
|
||||
// Compile a set of sequence names that exist in the reference file.
|
||||
SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary();
|
||||
|
||||
Set<String> referenceSequenceNames = new TreeSet<String>();
|
||||
for( SAMSequenceRecord dictionaryEntry: referenceDictionary.getSequences() )
|
||||
referenceSequenceNames.add(dictionaryEntry.getSequenceName());
|
||||
|
||||
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
|
||||
Set<String> intersectingSequenceNames = new HashSet<String>(readsSequenceNames);
|
||||
intersectingSequenceNames.retainAll(referenceSequenceNames);
|
||||
if( intersectingSequenceNames.size() == 0 ) {
|
||||
StringBuilder error = new StringBuilder();
|
||||
error.append("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n");
|
||||
error.append(System.getProperty("line.separator"));
|
||||
error.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary)));
|
||||
error.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary)));
|
||||
logger.error(error.toString());
|
||||
Utils.scareUser("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference.");
|
||||
}
|
||||
|
||||
// If one data source isn't a strict subset of the other or the match is of fewer than x% of the smaller,
|
||||
// issue a warning.
|
||||
if( !readsSequenceNames.equals(referenceSequenceNames) ) {
|
||||
StringBuilder warning = new StringBuilder();
|
||||
warning.append("Limited overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n");
|
||||
warning.append(System.getProperty("line.separator"));
|
||||
warning.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary)));
|
||||
warning.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary)));
|
||||
logger.warn(warning.toString());
|
||||
}
|
||||
}
|
||||
|
||||
private String prettyPrintSequenceRecords( SAMSequenceDictionary sequenceDictionary ) {
|
||||
String[] sequenceRecordNames = new String[ sequenceDictionary.size() ];
|
||||
int sequenceRecordIndex = 0;
|
||||
for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() )
|
||||
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
|
||||
return Arrays.deepToString(sequenceRecordNames);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ public class Utils {
|
|||
//System.out.printf("* %s%n", msg);
|
||||
//System.out.printf("********************************************************************************%n");
|
||||
logger.fatal(msg);
|
||||
throw new RuntimeException(msg);
|
||||
throw new StingException(msg);
|
||||
}
|
||||
|
||||
public static <T> List<T> cons(final T elt, final List<T> l) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue