Lots of error checking added, fixed bugs associated with reading files out of order, added support for U (unsafe) flag for processing reads
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@75 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
36b8b34490
commit
9ae551e858
|
|
@ -25,6 +25,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
@Option(shortName="T", doc="Type of analysis to run") public String Analysis_Name = null;
|
||||
@Option(shortName="DBSNP", doc="DBSNP file", optional=true) public String DBSNP_FILE = null;
|
||||
@Option(shortName="THREADED_IO", doc="If true, enables threaded I/O operations", optional=true) public String ENABLED_THREADED_IO = "false";
|
||||
@Option(shortName="U", doc="If true, enables unsafe operations, nothing will be checked at runtime. You better know what you are doing if you set this flag.", optional=false) public String UNSAFE = "false";
|
||||
|
||||
public static HashMap<String, Object> MODULES = new HashMap<String,Object>();
|
||||
public static void addModule(final String name, final Object walker) {
|
||||
|
|
@ -100,6 +101,8 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
engine.setLocation(REGION_STR);
|
||||
}
|
||||
|
||||
engine.setSafetyChecking(! UNSAFE.toLowerCase().equals("true"));
|
||||
|
||||
engine.initialize(ENABLED_THREADED_IO.toLowerCase().equals("true"));
|
||||
//engine.testReference();
|
||||
|
||||
|
|
|
|||
|
|
@ -50,9 +50,13 @@ public class TraversalEngine {
|
|||
|
||||
// Name of the reads file, in BAM/SAM format
|
||||
private File readsFile = null; // the name of the reads file
|
||||
private SAMFileReader samReader = null;
|
||||
// iterator over the sam records in the readsFile
|
||||
private Iterator<SAMRecord> samReadIter = null;
|
||||
|
||||
// The verifying iterator, it does checking
|
||||
VerifyingSamIterator verifyingSamReadIter = null;
|
||||
|
||||
// The reference data -- filename, refSeqFile, and iterator
|
||||
private File refFileName = null; // the name of the reference file
|
||||
private ReferenceSequenceFile refFile = null;
|
||||
|
|
@ -72,6 +76,7 @@ public class TraversalEngine {
|
|||
private FileProgressTracker samReadingTracker = null;
|
||||
|
||||
public boolean DEBUGGING = false;
|
||||
public boolean beSafeP = true;
|
||||
public long N_RECORDS_TO_PRINT = 100000;
|
||||
public int THREADED_IO_BUFFER_SIZE = 10000;
|
||||
|
||||
|
|
@ -107,6 +112,11 @@ public class TraversalEngine {
|
|||
public void setStrictness( final ValidationStringency s ) { strictness = s; }
|
||||
public void setMaxReads( final int maxReads ) { this.maxReads = maxReads; }
|
||||
public void setDebugging( final boolean d ) { DEBUGGING = d; }
|
||||
public void setSafetyChecking( final boolean beSafeP ) {
|
||||
if ( ! beSafeP )
|
||||
System.out.printf("*** Turning off safety checking, I hope you know what you are doing...%n");
|
||||
this.beSafeP = beSafeP;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -284,14 +294,19 @@ public class TraversalEngine {
|
|||
final FileInputStream samFileStream = new FileInputStream(readsFile);
|
||||
final InputStream bufferedStream= new BufferedInputStream(samFileStream);
|
||||
//final InputStream bufferedStream= new BufferedInputStream(samInputStream, 10000000);
|
||||
final SAMFileReader samReader = new SAMFileReader(bufferedStream, true);
|
||||
samReader = new SAMFileReader(bufferedStream, true);
|
||||
samReader.setValidationStringency(strictness);
|
||||
|
||||
final SAMFileHeader header = samReader.getFileHeader();
|
||||
System.err.println("Sort order is: " + header.getSortOrder());
|
||||
|
||||
samReadingTracker = new FileProgressTracker<SAMRecord>( readsFile, samReader.iterator(), samFileStream.getChannel(), 1000 );
|
||||
samReadIter = new VerifyingSamIterator(samReadingTracker);
|
||||
if ( beSafeP ) {
|
||||
verifyingSamReadIter = new VerifyingSamIterator(samReadingTracker);
|
||||
samReadIter = verifyingSamReadIter;
|
||||
} else {
|
||||
samReadIter = samReadingTracker;
|
||||
}
|
||||
|
||||
if ( THREADED_IO ) {
|
||||
System.out.printf("Enabling threaded I/O with buffer of %d reads%n", THREADED_IO_BUFFER_SIZE);
|
||||
|
|
@ -310,12 +325,12 @@ public class TraversalEngine {
|
|||
*
|
||||
*/
|
||||
protected void initializeReference() {
|
||||
if ( refFileName!= null ) {
|
||||
if ( refFileName != null ) {
|
||||
this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
|
||||
this.refIter = new ReferenceIterator(this.refFile);
|
||||
if ( ! Utils.setupRefContigOrdering(this.refFile) ) {
|
||||
// We couldn't process the reference contig ordering, fail since we need it
|
||||
throw new RuntimeException("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.");
|
||||
Utils.scareUser(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -413,6 +428,16 @@ public class TraversalEngine {
|
|||
}
|
||||
}
|
||||
|
||||
public void verifySortOrder(final boolean requiresSortedOrder) {
|
||||
if ( beSafeP && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate ) {
|
||||
final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing";
|
||||
if ( requiresSortedOrder || strictness == SAMFileReader.ValidationStringency.STRICT )
|
||||
throw new RuntimeIOException(msg);
|
||||
else if ( strictness == SAMFileReader.ValidationStringency.LENIENT )
|
||||
Utils.warnUser(msg);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse by loci -- the key driver of linearly ordered traversal of loci. Provides reads, RODs, and
|
||||
* the reference base for each locus in the reference to the LocusWalker walker. Supports all of the
|
||||
|
|
@ -424,6 +449,8 @@ public class TraversalEngine {
|
|||
* @return 0 on success
|
||||
*/
|
||||
protected <M,T> int traverseByLoci(LocusWalker<M,T> walker) {
|
||||
verifySortOrder(true);
|
||||
|
||||
// prepare the read filtering read iterator and provide it to a new locus iterator
|
||||
FilteringIterator filterIter = new FilteringIterator(samReadIter, new locusStreamFilterFunc());
|
||||
//LocusIterator iter = new SingleLocusIterator(filterIter);
|
||||
|
|
@ -490,13 +517,19 @@ public class TraversalEngine {
|
|||
* Traverse by read -- the key driver of linearly ordered traversal of reads. Provides a single read to
|
||||
* the walker object, in coordinate order. Supports all of the
|
||||
* interaction contract implied by the read walker
|
||||
*
|
||||
* sor
|
||||
* @param walker A read walker object
|
||||
* @param <M> MapType -- the result of calling map() on walker
|
||||
* @param <T> ReduceType -- the result of calling reduce() on the walker
|
||||
* @return 0 on success
|
||||
*/
|
||||
protected <M,R> int traverseByRead(ReadWalker<M,R> walker) {
|
||||
if ( refFileName == null && ! walker.requiresOrderedReads() ) {
|
||||
System.out.println("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing.");
|
||||
verifyingSamReadIter.setCheckOrderP(false);
|
||||
}
|
||||
|
||||
verifySortOrder( refFileName != null || walker.requiresOrderedReads());
|
||||
|
||||
// Initialize the walker
|
||||
walker.initialize();
|
||||
|
|
@ -515,10 +548,11 @@ public class TraversalEngine {
|
|||
GenomeLoc loc = Utils.genomicLocationOf(read);
|
||||
|
||||
// Jump forward in the reference to this locus location
|
||||
final ReferenceIterator refSite = refIter.seekForward(loc);
|
||||
final char refBase = refSite.getBaseAsChar();
|
||||
LocusContext locus = new LocusContext(loc, reads, offsets);
|
||||
locus.setReferenceContig(refSite.getCurrentContig());
|
||||
if ( ! loc.isUnmapped() && refIter != null ) {
|
||||
final ReferenceIterator refSite = refIter.seekForward(loc);
|
||||
locus.setReferenceContig(refSite.getCurrentContig());
|
||||
}
|
||||
|
||||
if ( inLocations(loc) ) {
|
||||
|
||||
|
|
|
|||
|
|
@ -94,6 +94,8 @@ public class ReferenceIterator implements Iterator<ReferenceIterator> {
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public ReferenceIterator seekForward(final GenomeLoc loc) {
|
||||
assert loc != null : "seekForward location is null";
|
||||
|
||||
return seekForwardOffset(loc.getContig(), loc.getStart() - 1);
|
||||
}
|
||||
|
||||
|
|
@ -102,6 +104,9 @@ public class ReferenceIterator implements Iterator<ReferenceIterator> {
|
|||
}
|
||||
|
||||
private ReferenceIterator seekForwardOffset(final String contigName, final long seekOffset) {
|
||||
assert contigName != null : "contigName is null";
|
||||
assert seekOffset >= 0 : "seekOffset < 0: " + seekOffset;
|
||||
|
||||
// jumps us forward in the sequence to the contig / pos
|
||||
if ( currentContig == null )
|
||||
next();
|
||||
|
|
|
|||
|
|
@ -30,12 +30,21 @@ public class VerifyingSamIterator implements Iterator<SAMRecord> {
|
|||
SAMRecord cur = it.next();
|
||||
if ( last != null )
|
||||
verifyRecord(last, cur);
|
||||
last = cur;
|
||||
if ( ! cur.getReadUnmappedFlag() )
|
||||
last = cur;
|
||||
return cur;
|
||||
}
|
||||
|
||||
/**
|
||||
* If true, enables ordered checking of the reads in the file. By default this is enabled.
|
||||
* @param checkP If true, sam records will be checked to insure they come in order
|
||||
*/
|
||||
public void setCheckOrderP( boolean checkP ) {
|
||||
checkOrderP = checkP;
|
||||
}
|
||||
|
||||
public void verifyRecord( final SAMRecord last, final SAMRecord cur ) {
|
||||
if ( checkOrderP ) {
|
||||
if ( checkOrderP && ! cur.getReadUnmappedFlag() ) {
|
||||
GenomeLoc lastLoc = Utils.genomicLocationOf( last );
|
||||
GenomeLoc curLoc = Utils.genomicLocationOf( cur );
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import org.broadinstitute.sting.gatk.LocusContext;
|
|||
* Time: 3:22:14 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class BaseQualityHistoWalker implements ReadWalker<Integer, Integer> {
|
||||
public class BaseQualityHistoWalker extends BasicReadWalker<Integer, Integer> {
|
||||
long[] qualCounts = new long[100];
|
||||
|
||||
public void initialize() {
|
||||
|
|
@ -20,8 +20,6 @@ public class BaseQualityHistoWalker implements ReadWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
public String walkerType() { return "ByRead"; }
|
||||
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(LocusContext context, SAMRecord read) {
|
||||
return true; // We are keeping all the reads
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
|||
public abstract class BasicReadWalker<MapType, ReduceType> implements ReadWalker<MapType, ReduceType> {
|
||||
public void initialize() { }
|
||||
public String walkerType() { return "ByRead"; }
|
||||
public boolean requiresOrderedReads() { return false; }
|
||||
|
||||
public boolean filter(LocusContext context, SAMRecord read) {
|
||||
// We are keeping all the reads
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.LocusContext;
|
|||
public interface ReadWalker<MapType, ReduceType> {
|
||||
void initialize();
|
||||
public String walkerType();
|
||||
public boolean requiresOrderedReads();
|
||||
|
||||
// Do we actually want to operate on the context?
|
||||
boolean filter(LocusContext context, SAMRecord read);
|
||||
|
|
|
|||
|
|
@ -123,6 +123,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
}
|
||||
|
||||
|
||||
public final boolean isUnmapped() { return this.contig == null; }
|
||||
public final boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; }
|
||||
public final boolean atBeginningOfContigP() { return this.start == 1; }
|
||||
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ public class RefHanger<T> {
|
|||
* @return
|
||||
*/
|
||||
public Hanger popLeft() {
|
||||
assert(hasHangers());
|
||||
assert hasHangers();
|
||||
return hangers.remove(0);
|
||||
}
|
||||
|
||||
|
|
@ -101,7 +101,7 @@ public class RefHanger<T> {
|
|||
* @return
|
||||
*/
|
||||
public Hanger getLeft() {
|
||||
assert(hasHangers());
|
||||
assert hasHangers();
|
||||
return getHanger(0);
|
||||
}
|
||||
|
||||
|
|
@ -111,7 +111,7 @@ public class RefHanger<T> {
|
|||
* @return
|
||||
*/
|
||||
public Hanger getHanger(int relativePos) {
|
||||
assert( hangers.contains(relativePos) );
|
||||
//assert hangers.contains(relativePos) : hangers + " " + relativePos;
|
||||
return hangers.get(relativePos);
|
||||
}
|
||||
|
||||
|
|
@ -209,7 +209,7 @@ public class RefHanger<T> {
|
|||
// we have nothing, just push right
|
||||
pushRight(pos, datum);
|
||||
else {
|
||||
assert( pos.compareTo(getRightLoc()) == 1 );
|
||||
//assert pos.compareTo(getRightLoc()) == 1 : pos + " " + getRightLoc() + " => " + pos.compareTo(getRightLoc());
|
||||
|
||||
GenomeLoc nextRight = getRightLoc().nextLoc();
|
||||
while ( pos.compareTo(nextRight) == 1 ) {
|
||||
|
|
|
|||
|
|
@ -16,6 +16,23 @@ import java.util.*;
|
|||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class Utils {
|
||||
public static void warnUser(final String msg) {
|
||||
System.out.printf("********************************************************************************%n");
|
||||
System.out.printf("* WARNING:%n");
|
||||
System.out.printf("*%n");
|
||||
System.out.printf("* %s%n", msg);
|
||||
System.out.printf("********************************************************************************%n");
|
||||
}
|
||||
|
||||
public static void scareUser(final String msg) {
|
||||
System.out.printf("********************************************************************************%n");
|
||||
System.out.printf("* ERROR:%n");
|
||||
System.out.printf("*%n");
|
||||
System.out.printf("* %s%n", msg);
|
||||
System.out.printf("********************************************************************************%n");
|
||||
throw new RuntimeException(msg);
|
||||
}
|
||||
|
||||
public static <T> List<T> filter(Predicate pred, Collection<T> c) {
|
||||
List<T> filtered = new ArrayList<T>();
|
||||
// loop through all the elements in c
|
||||
|
|
@ -142,7 +159,7 @@ public class Utils {
|
|||
}
|
||||
|
||||
GenomeLoc.setContigOrdering(refContigOrdering);
|
||||
return refContigOrdering != null;
|
||||
return refContigs != null;
|
||||
}
|
||||
|
||||
// Java Generics can't do primitive types, so I had to do this the simplistic way
|
||||
|
|
|
|||
Loading…
Reference in New Issue