From 9ae551e858e7bd20659845313f41e2b49ac53c29 Mon Sep 17 00:00:00 2001 From: depristo Date: Mon, 16 Mar 2009 23:22:04 +0000 Subject: [PATCH] Lots of error checking added, fixed bugs associated with reading files out of order, added support for U (unsafe) flag for processing reads git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@75 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisTK.java | 3 ++ .../sting/gatk/TraversalEngine.java | 50 ++++++++++++++++--- .../gatk/iterators/ReferenceIterator.java | 5 ++ .../gatk/iterators/VerifyingSamIterator.java | 13 ++++- .../gatk/walkers/BaseQualityHistoWalker.java | 4 +- .../sting/gatk/walkers/BasicReadWalker.java | 1 + .../sting/gatk/walkers/ReadWalker.java | 1 + .../broadinstitute/sting/utils/GenomeLoc.java | 1 + .../broadinstitute/sting/utils/RefHanger.java | 8 +-- .../org/broadinstitute/sting/utils/Utils.java | 19 ++++++- 10 files changed, 87 insertions(+), 18 deletions(-) diff --git a/playground/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/playground/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 134d39816..59dec1762 100644 --- a/playground/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/playground/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -25,6 +25,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { @Option(shortName="T", doc="Type of analysis to run") public String Analysis_Name = null; @Option(shortName="DBSNP", doc="DBSNP file", optional=true) public String DBSNP_FILE = null; @Option(shortName="THREADED_IO", doc="If true, enables threaded I/O operations", optional=true) public String ENABLED_THREADED_IO = "false"; + @Option(shortName="U", doc="If true, enables unsafe operations, nothing will be checked at runtime. You better know what you are doing if you set this flag.", optional=false) public String UNSAFE = "false"; public static HashMap MODULES = new HashMap(); public static void addModule(final String name, final Object walker) { @@ -100,6 +101,8 @@ public class GenomeAnalysisTK extends CommandLineProgram { engine.setLocation(REGION_STR); } + engine.setSafetyChecking(! UNSAFE.toLowerCase().equals("true")); + engine.initialize(ENABLED_THREADED_IO.toLowerCase().equals("true")); //engine.testReference(); diff --git a/playground/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java b/playground/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java index 1c2a71580..aa434ae7f 100755 --- a/playground/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java +++ b/playground/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java @@ -50,9 +50,13 @@ public class TraversalEngine { // Name of the reads file, in BAM/SAM format private File readsFile = null; // the name of the reads file + private SAMFileReader samReader = null; // iterator over the sam records in the readsFile private Iterator samReadIter = null; + // The verifying iterator, it does checking + VerifyingSamIterator verifyingSamReadIter = null; + // The reference data -- filename, refSeqFile, and iterator private File refFileName = null; // the name of the reference file private ReferenceSequenceFile refFile = null; @@ -72,6 +76,7 @@ public class TraversalEngine { private FileProgressTracker samReadingTracker = null; public boolean DEBUGGING = false; + public boolean beSafeP = true; public long N_RECORDS_TO_PRINT = 100000; public int THREADED_IO_BUFFER_SIZE = 10000; @@ -107,6 +112,11 @@ public class TraversalEngine { public void setStrictness( final ValidationStringency s ) { strictness = s; } public void setMaxReads( final int maxReads ) { this.maxReads = maxReads; } public void setDebugging( final boolean d ) { DEBUGGING = d; } + public void setSafetyChecking( final boolean beSafeP ) { + if ( ! beSafeP ) + System.out.printf("*** Turning off safety checking, I hope you know what you are doing...%n"); + this.beSafeP = beSafeP; + } // -------------------------------------------------------------------------------------------------------------- // @@ -284,14 +294,19 @@ public class TraversalEngine { final FileInputStream samFileStream = new FileInputStream(readsFile); final InputStream bufferedStream= new BufferedInputStream(samFileStream); //final InputStream bufferedStream= new BufferedInputStream(samInputStream, 10000000); - final SAMFileReader samReader = new SAMFileReader(bufferedStream, true); + samReader = new SAMFileReader(bufferedStream, true); samReader.setValidationStringency(strictness); final SAMFileHeader header = samReader.getFileHeader(); System.err.println("Sort order is: " + header.getSortOrder()); samReadingTracker = new FileProgressTracker( readsFile, samReader.iterator(), samFileStream.getChannel(), 1000 ); - samReadIter = new VerifyingSamIterator(samReadingTracker); + if ( beSafeP ) { + verifyingSamReadIter = new VerifyingSamIterator(samReadingTracker); + samReadIter = verifyingSamReadIter; + } else { + samReadIter = samReadingTracker; + } if ( THREADED_IO ) { System.out.printf("Enabling threaded I/O with buffer of %d reads%n", THREADED_IO_BUFFER_SIZE); @@ -310,12 +325,12 @@ public class TraversalEngine { * */ protected void initializeReference() { - if ( refFileName!= null ) { + if ( refFileName != null ) { this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName); this.refIter = new ReferenceIterator(this.refFile); if ( ! Utils.setupRefContigOrdering(this.refFile) ) { // We couldn't process the reference contig ordering, fail since we need it - throw new RuntimeException("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down."); + Utils.scareUser(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName)); } } } @@ -413,6 +428,16 @@ public class TraversalEngine { } } + public void verifySortOrder(final boolean requiresSortedOrder) { + if ( beSafeP && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate ) { + final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing"; + if ( requiresSortedOrder || strictness == SAMFileReader.ValidationStringency.STRICT ) + throw new RuntimeIOException(msg); + else if ( strictness == SAMFileReader.ValidationStringency.LENIENT ) + Utils.warnUser(msg); + } + } + /** * Traverse by loci -- the key driver of linearly ordered traversal of loci. Provides reads, RODs, and * the reference base for each locus in the reference to the LocusWalker walker. Supports all of the @@ -424,6 +449,8 @@ public class TraversalEngine { * @return 0 on success */ protected int traverseByLoci(LocusWalker walker) { + verifySortOrder(true); + // prepare the read filtering read iterator and provide it to a new locus iterator FilteringIterator filterIter = new FilteringIterator(samReadIter, new locusStreamFilterFunc()); //LocusIterator iter = new SingleLocusIterator(filterIter); @@ -490,13 +517,19 @@ public class TraversalEngine { * Traverse by read -- the key driver of linearly ordered traversal of reads. Provides a single read to * the walker object, in coordinate order. Supports all of the * interaction contract implied by the read walker - * + * sor * @param walker A read walker object * @param MapType -- the result of calling map() on walker * @param ReduceType -- the result of calling reduce() on the walker * @return 0 on success */ protected int traverseByRead(ReadWalker walker) { + if ( refFileName == null && ! walker.requiresOrderedReads() ) { + System.out.println("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing."); + verifyingSamReadIter.setCheckOrderP(false); + } + + verifySortOrder( refFileName != null || walker.requiresOrderedReads()); // Initialize the walker walker.initialize(); @@ -515,10 +548,11 @@ public class TraversalEngine { GenomeLoc loc = Utils.genomicLocationOf(read); // Jump forward in the reference to this locus location - final ReferenceIterator refSite = refIter.seekForward(loc); - final char refBase = refSite.getBaseAsChar(); LocusContext locus = new LocusContext(loc, reads, offsets); - locus.setReferenceContig(refSite.getCurrentContig()); + if ( ! loc.isUnmapped() && refIter != null ) { + final ReferenceIterator refSite = refIter.seekForward(loc); + locus.setReferenceContig(refSite.getCurrentContig()); + } if ( inLocations(loc) ) { diff --git a/playground/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java b/playground/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java index c69a5e926..f4d3038b2 100755 --- a/playground/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java +++ b/playground/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java @@ -94,6 +94,8 @@ public class ReferenceIterator implements Iterator { // // -------------------------------------------------------------------------------------------------------------- public ReferenceIterator seekForward(final GenomeLoc loc) { + assert loc != null : "seekForward location is null"; + return seekForwardOffset(loc.getContig(), loc.getStart() - 1); } @@ -102,6 +104,9 @@ public class ReferenceIterator implements Iterator { } private ReferenceIterator seekForwardOffset(final String contigName, final long seekOffset) { + assert contigName != null : "contigName is null"; + assert seekOffset >= 0 : "seekOffset < 0: " + seekOffset; + // jumps us forward in the sequence to the contig / pos if ( currentContig == null ) next(); diff --git a/playground/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/playground/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index 81ab4d047..54dbe3101 100644 --- a/playground/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/playground/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -30,12 +30,21 @@ public class VerifyingSamIterator implements Iterator { SAMRecord cur = it.next(); if ( last != null ) verifyRecord(last, cur); - last = cur; + if ( ! cur.getReadUnmappedFlag() ) + last = cur; return cur; } + /** + * If true, enables ordered checking of the reads in the file. By default this is enabled. + * @param checkP If true, sam records will be checked to insure they come in order + */ + public void setCheckOrderP( boolean checkP ) { + checkOrderP = checkP; + } + public void verifyRecord( final SAMRecord last, final SAMRecord cur ) { - if ( checkOrderP ) { + if ( checkOrderP && ! cur.getReadUnmappedFlag() ) { GenomeLoc lastLoc = Utils.genomicLocationOf( last ); GenomeLoc curLoc = Utils.genomicLocationOf( cur ); diff --git a/playground/java/src/org/broadinstitute/sting/gatk/walkers/BaseQualityHistoWalker.java b/playground/java/src/org/broadinstitute/sting/gatk/walkers/BaseQualityHistoWalker.java index 215af2871..569a172ed 100755 --- a/playground/java/src/org/broadinstitute/sting/gatk/walkers/BaseQualityHistoWalker.java +++ b/playground/java/src/org/broadinstitute/sting/gatk/walkers/BaseQualityHistoWalker.java @@ -11,7 +11,7 @@ import org.broadinstitute.sting.gatk.LocusContext; * Time: 3:22:14 PM * To change this template use File | Settings | File Templates. */ -public class BaseQualityHistoWalker implements ReadWalker { +public class BaseQualityHistoWalker extends BasicReadWalker { long[] qualCounts = new long[100]; public void initialize() { @@ -20,8 +20,6 @@ public class BaseQualityHistoWalker implements ReadWalker { } } - public String walkerType() { return "ByRead"; } - // Do we actually want to operate on the context? public boolean filter(LocusContext context, SAMRecord read) { return true; // We are keeping all the reads diff --git a/playground/java/src/org/broadinstitute/sting/gatk/walkers/BasicReadWalker.java b/playground/java/src/org/broadinstitute/sting/gatk/walkers/BasicReadWalker.java index 209aa6773..fc7e9448d 100755 --- a/playground/java/src/org/broadinstitute/sting/gatk/walkers/BasicReadWalker.java +++ b/playground/java/src/org/broadinstitute/sting/gatk/walkers/BasicReadWalker.java @@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.walkers.ReadWalker; public abstract class BasicReadWalker implements ReadWalker { public void initialize() { } public String walkerType() { return "ByRead"; } + public boolean requiresOrderedReads() { return false; } public boolean filter(LocusContext context, SAMRecord read) { // We are keeping all the reads diff --git a/playground/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/playground/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java index 14cecfa96..7ee90c6c5 100755 --- a/playground/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java +++ b/playground/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.LocusContext; public interface ReadWalker { void initialize(); public String walkerType(); + public boolean requiresOrderedReads(); // Do we actually want to operate on the context? boolean filter(LocusContext context, SAMRecord read); diff --git a/playground/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/playground/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index d263d33a9..a88c93c14 100644 --- a/playground/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/playground/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -123,6 +123,7 @@ public class GenomeLoc implements Comparable { } + public final boolean isUnmapped() { return this.contig == null; } public final boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; } public final boolean atBeginningOfContigP() { return this.start == 1; } diff --git a/playground/java/src/org/broadinstitute/sting/utils/RefHanger.java b/playground/java/src/org/broadinstitute/sting/utils/RefHanger.java index 01f89f0a4..e6203bce1 100755 --- a/playground/java/src/org/broadinstitute/sting/utils/RefHanger.java +++ b/playground/java/src/org/broadinstitute/sting/utils/RefHanger.java @@ -87,7 +87,7 @@ public class RefHanger { * @return */ public Hanger popLeft() { - assert(hasHangers()); + assert hasHangers(); return hangers.remove(0); } @@ -101,7 +101,7 @@ public class RefHanger { * @return */ public Hanger getLeft() { - assert(hasHangers()); + assert hasHangers(); return getHanger(0); } @@ -111,7 +111,7 @@ public class RefHanger { * @return */ public Hanger getHanger(int relativePos) { - assert( hangers.contains(relativePos) ); + //assert hangers.contains(relativePos) : hangers + " " + relativePos; return hangers.get(relativePos); } @@ -209,7 +209,7 @@ public class RefHanger { // we have nothing, just push right pushRight(pos, datum); else { - assert( pos.compareTo(getRightLoc()) == 1 ); + //assert pos.compareTo(getRightLoc()) == 1 : pos + " " + getRightLoc() + " => " + pos.compareTo(getRightLoc()); GenomeLoc nextRight = getRightLoc().nextLoc(); while ( pos.compareTo(nextRight) == 1 ) { diff --git a/playground/java/src/org/broadinstitute/sting/utils/Utils.java b/playground/java/src/org/broadinstitute/sting/utils/Utils.java index 4616d791e..9e9e631ff 100755 --- a/playground/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/playground/java/src/org/broadinstitute/sting/utils/Utils.java @@ -16,6 +16,23 @@ import java.util.*; * To change this template use File | Settings | File Templates. */ public class Utils { + public static void warnUser(final String msg) { + System.out.printf("********************************************************************************%n"); + System.out.printf("* WARNING:%n"); + System.out.printf("*%n"); + System.out.printf("* %s%n", msg); + System.out.printf("********************************************************************************%n"); + } + + public static void scareUser(final String msg) { + System.out.printf("********************************************************************************%n"); + System.out.printf("* ERROR:%n"); + System.out.printf("*%n"); + System.out.printf("* %s%n", msg); + System.out.printf("********************************************************************************%n"); + throw new RuntimeException(msg); + } + public static List filter(Predicate pred, Collection c) { List filtered = new ArrayList(); // loop through all the elements in c @@ -142,7 +159,7 @@ public class Utils { } GenomeLoc.setContigOrdering(refContigOrdering); - return refContigOrdering != null; + return refContigs != null; } // Java Generics can't do primitive types, so I had to do this the simplistic way