Fix for GSA-90: System isn't failing with an error when you use the wrong reference.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1225 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-07-13 20:42:12 +00:00
parent 52659d02d4
commit b18caa2052
2 changed files with 66 additions and 3 deletions

View File

@ -26,6 +26,9 @@
package org.broadinstitute.sting.gatk.executive;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
@ -41,12 +44,12 @@ import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
import java.util.*;
/**
@ -114,6 +117,8 @@ public abstract class MicroScheduler {
this.reads = getReadsDataSource(reads);
this.reference = openReferenceSequenceFile(refFile);
this.rods = getReferenceOrderedDataSources(rods);
validate(this.reads,this.reference);
}
/**
@ -277,4 +282,62 @@ public abstract class MicroScheduler {
GenomeLocParser.setupRefContigOrdering(ref);
return ref;
}
/**
* Now that all files are open, validate the sequence dictionaries of the reads vs. the reference.
* TODO: Doing this in the MicroScheduler is a bit late, but this is where data sources are initialized.
* TODO: Move the initialization of data sources back to the GenomeAnalysisEngine.
* @param reads Reads data source.
* @param reference Reference data source.
*/
private void validate( SAMDataSource reads, ReferenceSequenceFile reference ) {
if( reads == null || reference == null )
return;
// Compile a set of sequence names that exist in the BAM files.
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
Set<String> readsSequenceNames = new TreeSet<String>();
for( SAMSequenceRecord dictionaryEntry: readsDictionary.getSequences() )
readsSequenceNames.add(dictionaryEntry.getSequenceName());
// Compile a set of sequence names that exist in the reference file.
SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary();
Set<String> referenceSequenceNames = new TreeSet<String>();
for( SAMSequenceRecord dictionaryEntry: referenceDictionary.getSequences() )
referenceSequenceNames.add(dictionaryEntry.getSequenceName());
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
Set<String> intersectingSequenceNames = new HashSet<String>(readsSequenceNames);
intersectingSequenceNames.retainAll(referenceSequenceNames);
if( intersectingSequenceNames.size() == 0 ) {
StringBuilder error = new StringBuilder();
error.append("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n");
error.append(System.getProperty("line.separator"));
error.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary)));
error.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary)));
logger.error(error.toString());
Utils.scareUser("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference.");
}
// If one data source isn't a strict subset of the other or the match is of fewer than x% of the smaller,
// issue a warning.
if( !readsSequenceNames.equals(referenceSequenceNames) ) {
StringBuilder warning = new StringBuilder();
warning.append("Limited overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n");
warning.append(System.getProperty("line.separator"));
warning.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary)));
warning.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary)));
logger.warn(warning.toString());
}
}
private String prettyPrintSequenceRecords( SAMSequenceDictionary sequenceDictionary ) {
String[] sequenceRecordNames = new String[ sequenceDictionary.size() ];
int sequenceRecordIndex = 0;
for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() )
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
return Arrays.deepToString(sequenceRecordNames);
}
}

View File

@ -40,7 +40,7 @@ public class Utils {
//System.out.printf("* %s%n", msg);
//System.out.printf("********************************************************************************%n");
logger.fatal(msg);
throw new RuntimeException(msg);
throw new StingException(msg);
}
public static <T> List<T> cons(final T elt, final List<T> l) {