Lots of error checking added, fixed bugs associated with reading files out of order, added support for U (unsafe) flag for processing reads

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@75 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-03-16 23:22:04 +00:00
parent 36b8b34490
commit 9ae551e858
10 changed files with 87 additions and 18 deletions

View File

@ -25,6 +25,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
@Option(shortName="T", doc="Type of analysis to run") public String Analysis_Name = null; @Option(shortName="T", doc="Type of analysis to run") public String Analysis_Name = null;
@Option(shortName="DBSNP", doc="DBSNP file", optional=true) public String DBSNP_FILE = null; @Option(shortName="DBSNP", doc="DBSNP file", optional=true) public String DBSNP_FILE = null;
@Option(shortName="THREADED_IO", doc="If true, enables threaded I/O operations", optional=true) public String ENABLED_THREADED_IO = "false"; @Option(shortName="THREADED_IO", doc="If true, enables threaded I/O operations", optional=true) public String ENABLED_THREADED_IO = "false";
@Option(shortName="U", doc="If true, enables unsafe operations, nothing will be checked at runtime. You better know what you are doing if you set this flag.", optional=false) public String UNSAFE = "false";
public static HashMap<String, Object> MODULES = new HashMap<String,Object>(); public static HashMap<String, Object> MODULES = new HashMap<String,Object>();
public static void addModule(final String name, final Object walker) { public static void addModule(final String name, final Object walker) {
@ -100,6 +101,8 @@ public class GenomeAnalysisTK extends CommandLineProgram {
engine.setLocation(REGION_STR); engine.setLocation(REGION_STR);
} }
engine.setSafetyChecking(! UNSAFE.toLowerCase().equals("true"));
engine.initialize(ENABLED_THREADED_IO.toLowerCase().equals("true")); engine.initialize(ENABLED_THREADED_IO.toLowerCase().equals("true"));
//engine.testReference(); //engine.testReference();

View File

@ -50,9 +50,13 @@ public class TraversalEngine {
// Name of the reads file, in BAM/SAM format // Name of the reads file, in BAM/SAM format
private File readsFile = null; // the name of the reads file private File readsFile = null; // the name of the reads file
private SAMFileReader samReader = null;
// iterator over the sam records in the readsFile // iterator over the sam records in the readsFile
private Iterator<SAMRecord> samReadIter = null; private Iterator<SAMRecord> samReadIter = null;
// The verifying iterator, it does checking
VerifyingSamIterator verifyingSamReadIter = null;
// The reference data -- filename, refSeqFile, and iterator // The reference data -- filename, refSeqFile, and iterator
private File refFileName = null; // the name of the reference file private File refFileName = null; // the name of the reference file
private ReferenceSequenceFile refFile = null; private ReferenceSequenceFile refFile = null;
@ -72,6 +76,7 @@ public class TraversalEngine {
private FileProgressTracker samReadingTracker = null; private FileProgressTracker samReadingTracker = null;
public boolean DEBUGGING = false; public boolean DEBUGGING = false;
public boolean beSafeP = true;
public long N_RECORDS_TO_PRINT = 100000; public long N_RECORDS_TO_PRINT = 100000;
public int THREADED_IO_BUFFER_SIZE = 10000; public int THREADED_IO_BUFFER_SIZE = 10000;
@ -107,6 +112,11 @@ public class TraversalEngine {
public void setStrictness( final ValidationStringency s ) { strictness = s; } public void setStrictness( final ValidationStringency s ) { strictness = s; }
public void setMaxReads( final int maxReads ) { this.maxReads = maxReads; } public void setMaxReads( final int maxReads ) { this.maxReads = maxReads; }
public void setDebugging( final boolean d ) { DEBUGGING = d; } public void setDebugging( final boolean d ) { DEBUGGING = d; }
public void setSafetyChecking( final boolean beSafeP ) {
if ( ! beSafeP )
System.out.printf("*** Turning off safety checking, I hope you know what you are doing...%n");
this.beSafeP = beSafeP;
}
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
@ -284,14 +294,19 @@ public class TraversalEngine {
final FileInputStream samFileStream = new FileInputStream(readsFile); final FileInputStream samFileStream = new FileInputStream(readsFile);
final InputStream bufferedStream= new BufferedInputStream(samFileStream); final InputStream bufferedStream= new BufferedInputStream(samFileStream);
//final InputStream bufferedStream= new BufferedInputStream(samInputStream, 10000000); //final InputStream bufferedStream= new BufferedInputStream(samInputStream, 10000000);
final SAMFileReader samReader = new SAMFileReader(bufferedStream, true); samReader = new SAMFileReader(bufferedStream, true);
samReader.setValidationStringency(strictness); samReader.setValidationStringency(strictness);
final SAMFileHeader header = samReader.getFileHeader(); final SAMFileHeader header = samReader.getFileHeader();
System.err.println("Sort order is: " + header.getSortOrder()); System.err.println("Sort order is: " + header.getSortOrder());
samReadingTracker = new FileProgressTracker<SAMRecord>( readsFile, samReader.iterator(), samFileStream.getChannel(), 1000 ); samReadingTracker = new FileProgressTracker<SAMRecord>( readsFile, samReader.iterator(), samFileStream.getChannel(), 1000 );
samReadIter = new VerifyingSamIterator(samReadingTracker); if ( beSafeP ) {
verifyingSamReadIter = new VerifyingSamIterator(samReadingTracker);
samReadIter = verifyingSamReadIter;
} else {
samReadIter = samReadingTracker;
}
if ( THREADED_IO ) { if ( THREADED_IO ) {
System.out.printf("Enabling threaded I/O with buffer of %d reads%n", THREADED_IO_BUFFER_SIZE); System.out.printf("Enabling threaded I/O with buffer of %d reads%n", THREADED_IO_BUFFER_SIZE);
@ -310,12 +325,12 @@ public class TraversalEngine {
* *
*/ */
protected void initializeReference() { protected void initializeReference() {
if ( refFileName!= null ) { if ( refFileName != null ) {
this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName); this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
this.refIter = new ReferenceIterator(this.refFile); this.refIter = new ReferenceIterator(this.refFile);
if ( ! Utils.setupRefContigOrdering(this.refFile) ) { if ( ! Utils.setupRefContigOrdering(this.refFile) ) {
// We couldn't process the reference contig ordering, fail since we need it // We couldn't process the reference contig ordering, fail since we need it
throw new RuntimeException("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down."); Utils.scareUser(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName));
} }
} }
} }
@ -413,6 +428,16 @@ public class TraversalEngine {
} }
} }
public void verifySortOrder(final boolean requiresSortedOrder) {
if ( beSafeP && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate ) {
final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing";
if ( requiresSortedOrder || strictness == SAMFileReader.ValidationStringency.STRICT )
throw new RuntimeIOException(msg);
else if ( strictness == SAMFileReader.ValidationStringency.LENIENT )
Utils.warnUser(msg);
}
}
/** /**
* Traverse by loci -- the key driver of linearly ordered traversal of loci. Provides reads, RODs, and * Traverse by loci -- the key driver of linearly ordered traversal of loci. Provides reads, RODs, and
* the reference base for each locus in the reference to the LocusWalker walker. Supports all of the * the reference base for each locus in the reference to the LocusWalker walker. Supports all of the
@ -424,6 +449,8 @@ public class TraversalEngine {
* @return 0 on success * @return 0 on success
*/ */
protected <M,T> int traverseByLoci(LocusWalker<M,T> walker) { protected <M,T> int traverseByLoci(LocusWalker<M,T> walker) {
verifySortOrder(true);
// prepare the read filtering read iterator and provide it to a new locus iterator // prepare the read filtering read iterator and provide it to a new locus iterator
FilteringIterator filterIter = new FilteringIterator(samReadIter, new locusStreamFilterFunc()); FilteringIterator filterIter = new FilteringIterator(samReadIter, new locusStreamFilterFunc());
//LocusIterator iter = new SingleLocusIterator(filterIter); //LocusIterator iter = new SingleLocusIterator(filterIter);
@ -490,13 +517,19 @@ public class TraversalEngine {
* Traverse by read -- the key driver of linearly ordered traversal of reads. Provides a single read to * Traverse by read -- the key driver of linearly ordered traversal of reads. Provides a single read to
* the walker object, in coordinate order. Supports all of the * the walker object, in coordinate order. Supports all of the
* interaction contract implied by the read walker * interaction contract implied by the read walker
* * sor
* @param walker A read walker object * @param walker A read walker object
* @param <M> MapType -- the result of calling map() on walker * @param <M> MapType -- the result of calling map() on walker
* @param <T> ReduceType -- the result of calling reduce() on the walker * @param <T> ReduceType -- the result of calling reduce() on the walker
* @return 0 on success * @return 0 on success
*/ */
protected <M,R> int traverseByRead(ReadWalker<M,R> walker) { protected <M,R> int traverseByRead(ReadWalker<M,R> walker) {
if ( refFileName == null && ! walker.requiresOrderedReads() ) {
System.out.println("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing.");
verifyingSamReadIter.setCheckOrderP(false);
}
verifySortOrder( refFileName != null || walker.requiresOrderedReads());
// Initialize the walker // Initialize the walker
walker.initialize(); walker.initialize();
@ -515,10 +548,11 @@ public class TraversalEngine {
GenomeLoc loc = Utils.genomicLocationOf(read); GenomeLoc loc = Utils.genomicLocationOf(read);
// Jump forward in the reference to this locus location // Jump forward in the reference to this locus location
final ReferenceIterator refSite = refIter.seekForward(loc);
final char refBase = refSite.getBaseAsChar();
LocusContext locus = new LocusContext(loc, reads, offsets); LocusContext locus = new LocusContext(loc, reads, offsets);
if ( ! loc.isUnmapped() && refIter != null ) {
final ReferenceIterator refSite = refIter.seekForward(loc);
locus.setReferenceContig(refSite.getCurrentContig()); locus.setReferenceContig(refSite.getCurrentContig());
}
if ( inLocations(loc) ) { if ( inLocations(loc) ) {

View File

@ -94,6 +94,8 @@ public class ReferenceIterator implements Iterator<ReferenceIterator> {
// //
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
public ReferenceIterator seekForward(final GenomeLoc loc) { public ReferenceIterator seekForward(final GenomeLoc loc) {
assert loc != null : "seekForward location is null";
return seekForwardOffset(loc.getContig(), loc.getStart() - 1); return seekForwardOffset(loc.getContig(), loc.getStart() - 1);
} }
@ -102,6 +104,9 @@ public class ReferenceIterator implements Iterator<ReferenceIterator> {
} }
private ReferenceIterator seekForwardOffset(final String contigName, final long seekOffset) { private ReferenceIterator seekForwardOffset(final String contigName, final long seekOffset) {
assert contigName != null : "contigName is null";
assert seekOffset >= 0 : "seekOffset < 0: " + seekOffset;
// jumps us forward in the sequence to the contig / pos // jumps us forward in the sequence to the contig / pos
if ( currentContig == null ) if ( currentContig == null )
next(); next();

View File

@ -30,12 +30,21 @@ public class VerifyingSamIterator implements Iterator<SAMRecord> {
SAMRecord cur = it.next(); SAMRecord cur = it.next();
if ( last != null ) if ( last != null )
verifyRecord(last, cur); verifyRecord(last, cur);
if ( ! cur.getReadUnmappedFlag() )
last = cur; last = cur;
return cur; return cur;
} }
/**
* If true, enables ordered checking of the reads in the file. By default this is enabled.
* @param checkP If true, sam records will be checked to insure they come in order
*/
public void setCheckOrderP( boolean checkP ) {
checkOrderP = checkP;
}
public void verifyRecord( final SAMRecord last, final SAMRecord cur ) { public void verifyRecord( final SAMRecord last, final SAMRecord cur ) {
if ( checkOrderP ) { if ( checkOrderP && ! cur.getReadUnmappedFlag() ) {
GenomeLoc lastLoc = Utils.genomicLocationOf( last ); GenomeLoc lastLoc = Utils.genomicLocationOf( last );
GenomeLoc curLoc = Utils.genomicLocationOf( cur ); GenomeLoc curLoc = Utils.genomicLocationOf( cur );

View File

@ -11,7 +11,7 @@ import org.broadinstitute.sting.gatk.LocusContext;
* Time: 3:22:14 PM * Time: 3:22:14 PM
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
public class BaseQualityHistoWalker implements ReadWalker<Integer, Integer> { public class BaseQualityHistoWalker extends BasicReadWalker<Integer, Integer> {
long[] qualCounts = new long[100]; long[] qualCounts = new long[100];
public void initialize() { public void initialize() {
@ -20,8 +20,6 @@ public class BaseQualityHistoWalker implements ReadWalker<Integer, Integer> {
} }
} }
public String walkerType() { return "ByRead"; }
// Do we actually want to operate on the context? // Do we actually want to operate on the context?
public boolean filter(LocusContext context, SAMRecord read) { public boolean filter(LocusContext context, SAMRecord read) {
return true; // We are keeping all the reads return true; // We are keeping all the reads

View File

@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.walkers.ReadWalker;
public abstract class BasicReadWalker<MapType, ReduceType> implements ReadWalker<MapType, ReduceType> { public abstract class BasicReadWalker<MapType, ReduceType> implements ReadWalker<MapType, ReduceType> {
public void initialize() { } public void initialize() { }
public String walkerType() { return "ByRead"; } public String walkerType() { return "ByRead"; }
public boolean requiresOrderedReads() { return false; }
public boolean filter(LocusContext context, SAMRecord read) { public boolean filter(LocusContext context, SAMRecord read) {
// We are keeping all the reads // We are keeping all the reads

View File

@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.LocusContext;
public interface ReadWalker<MapType, ReduceType> { public interface ReadWalker<MapType, ReduceType> {
void initialize(); void initialize();
public String walkerType(); public String walkerType();
public boolean requiresOrderedReads();
// Do we actually want to operate on the context? // Do we actually want to operate on the context?
boolean filter(LocusContext context, SAMRecord read); boolean filter(LocusContext context, SAMRecord read);

View File

@ -123,6 +123,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
} }
public final boolean isUnmapped() { return this.contig == null; }
public final boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; } public final boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; }
public final boolean atBeginningOfContigP() { return this.start == 1; } public final boolean atBeginningOfContigP() { return this.start == 1; }

View File

@ -87,7 +87,7 @@ public class RefHanger<T> {
* @return * @return
*/ */
public Hanger popLeft() { public Hanger popLeft() {
assert(hasHangers()); assert hasHangers();
return hangers.remove(0); return hangers.remove(0);
} }
@ -101,7 +101,7 @@ public class RefHanger<T> {
* @return * @return
*/ */
public Hanger getLeft() { public Hanger getLeft() {
assert(hasHangers()); assert hasHangers();
return getHanger(0); return getHanger(0);
} }
@ -111,7 +111,7 @@ public class RefHanger<T> {
* @return * @return
*/ */
public Hanger getHanger(int relativePos) { public Hanger getHanger(int relativePos) {
assert( hangers.contains(relativePos) ); //assert hangers.contains(relativePos) : hangers + " " + relativePos;
return hangers.get(relativePos); return hangers.get(relativePos);
} }
@ -209,7 +209,7 @@ public class RefHanger<T> {
// we have nothing, just push right // we have nothing, just push right
pushRight(pos, datum); pushRight(pos, datum);
else { else {
assert( pos.compareTo(getRightLoc()) == 1 ); //assert pos.compareTo(getRightLoc()) == 1 : pos + " " + getRightLoc() + " => " + pos.compareTo(getRightLoc());
GenomeLoc nextRight = getRightLoc().nextLoc(); GenomeLoc nextRight = getRightLoc().nextLoc();
while ( pos.compareTo(nextRight) == 1 ) { while ( pos.compareTo(nextRight) == 1 ) {

View File

@ -16,6 +16,23 @@ import java.util.*;
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
public class Utils { public class Utils {
public static void warnUser(final String msg) {
System.out.printf("********************************************************************************%n");
System.out.printf("* WARNING:%n");
System.out.printf("*%n");
System.out.printf("* %s%n", msg);
System.out.printf("********************************************************************************%n");
}
public static void scareUser(final String msg) {
System.out.printf("********************************************************************************%n");
System.out.printf("* ERROR:%n");
System.out.printf("*%n");
System.out.printf("* %s%n", msg);
System.out.printf("********************************************************************************%n");
throw new RuntimeException(msg);
}
public static <T> List<T> filter(Predicate pred, Collection<T> c) { public static <T> List<T> filter(Predicate pred, Collection<T> c) {
List<T> filtered = new ArrayList<T>(); List<T> filtered = new ArrayList<T>();
// loop through all the elements in c // loop through all the elements in c
@ -142,7 +159,7 @@ public class Utils {
} }
GenomeLoc.setContigOrdering(refContigOrdering); GenomeLoc.setContigOrdering(refContigOrdering);
return refContigOrdering != null; return refContigs != null;
} }
// Java Generics can't do primitive types, so I had to do this the simplistic way // Java Generics can't do primitive types, so I had to do this the simplistic way