diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 97c37f76b..26f24c4f3 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -28,12 +28,14 @@ import java.io.FileNotFoundException; import java.io.PrintStream; import java.util.ArrayList; import java.util.List; +import java.util.Arrays; +import java.util.Collections; public class GenomeAnalysisTK extends CommandLineProgram { public static GenomeAnalysisTK Instance = null; // parameters and their defaults - public File INPUT_FILE = null; + public List INPUT_FILES = null; public String MAX_READS_ARG = "-1"; public String STRICTNESS_ARG = "strict"; public File REF_FILE_ARG = null; @@ -106,7 +108,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { * Flags don't take an argument, the associated Boolean gets set to true if the flag appears on the command line. */ protected void setupArgs() { - m_parser.addOptionalArg("input_file", "I", "SAM or BAM file", "INPUT_FILE"); + m_parser.addOptionalArgList("input_file", "I", "SAM or BAM file", "INPUT_FILES"); //m_parser.addRequiredArg("input_file", "I", "SAM or BAM file", "INPUT_FILE"); m_parser.addOptionalArg("maximum_reads", "M", "Maximum number of reads to process before exiting", "MAX_READS_ARG"); m_parser.addOptionalArg("validation_strictness", "S", "How strict should we be with validation (LENIENT|SILENT|STRICT)", "STRICTNESS_ARG"); @@ -255,34 +257,34 @@ public class GenomeAnalysisTK extends CommandLineProgram { if ( REF_FILE_ARG == null ) Utils.scareUser(String.format("Locus-based traversals require a reference file but none was given")); - if ( INPUT_FILE == null ) { + if ( INPUT_FILES == null || INPUT_FILES.size() == 0 ) { if ( walker.requiresReads() ) Utils.scareUser(String.format("Analysis %s requires reads, but none were given", Analysis_Name)); this.engine = new TraverseByReference(null, REF_FILE_ARG, rods); } else { if ( walker.cannotHandleReads() ) - Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILE)); + Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILES)); if ( WALK_ALL_LOCI ) { // TODO: Temporary debugging code. Activate the new debugging code only when the MicroManager // is not filtered. if( ENABLE_THREADING ) { logger.warn("Preliminary threading support enabled"); - microManager = new MicroManager( INPUT_FILE, REF_FILE_ARG, numThreads ); + microManager = new MicroManager( INPUT_FILES, REF_FILE_ARG, numThreads ); this.engine = microManager.getTraversalEngine(); } else { - this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods); + this.engine = new TraverseByLociByReference(INPUT_FILES, REF_FILE_ARG, rods); } } else - this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods); + this.engine = new TraverseByLoci(INPUT_FILES, REF_FILE_ARG, rods); } } else if ( my_walker instanceof IntervalWalker ) { - this.engine = new TraverseByIntervals(INPUT_FILE, REF_FILE_ARG, rods); + this.engine = new TraverseByIntervals(INPUT_FILES, REF_FILE_ARG, rods); } else { // we're a read walker - this.engine = new TraverseByReads(INPUT_FILE, REF_FILE_ARG, rods); + this.engine = new TraverseByReads(INPUT_FILES, REF_FILE_ARG, rods); } // Prepare the sort ordering w.r.t. the sequence dictionary diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/InvalidPositionException.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/InvalidPositionException.java new file mode 100755 index 000000000..801f90780 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/InvalidPositionException.java @@ -0,0 +1,23 @@ +package org.broadinstitute.sting.gatk.dataSources.providers; + +import org.broadinstitute.sting.utils.StingException; + +/** + * Created by IntelliJ IDEA. + * User: hanna + * Date: Apr 16, 2009 + * Time: 4:11:40 PM + * + * Thrown to indicate invalid positions passed to the providers. + * Extend from RuntimeException to make it easier on our walker writers; don't make + * them catch every exception that comes their way. + */ +public class InvalidPositionException extends RuntimeException { + public InvalidPositionException(String message) { + super(message); + } + + public InvalidPositionException(String message, Throwable throwable) { + super(message,throwable); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java index 047a9e4b7..df46e9619 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java @@ -1,7 +1,9 @@ package org.broadinstitute.sting.gatk.dataSources.providers; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequence; +import net.sf.samtools.util.StringUtil; /** * Created by IntelliJ IDEA. @@ -11,17 +13,33 @@ import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; * To change this template use File | Settings | File Templates. */ public class ReferenceProvider { - private ReferenceIterator reference; + private ReferenceSequence referenceSequence; + private GenomeLoc referenceInterval; - public ReferenceProvider( ReferenceIterator reference ) { - this.reference = reference; + public ReferenceProvider( IndexedFastaSequenceFile sequenceFile, GenomeLoc position ) { + this.referenceSequence = sequenceFile.getSubsequenceAt( position.getContig(), + position.getStart(), + position.getStop() ); + this.referenceInterval = position; } - public ReferenceIterator getReferenceSequence( GenomeLoc genomeLoc ) { - if( (genomeLoc.getStop() - genomeLoc.getStart()) > 0 ) - throw new RuntimeException( "Internal error :LocusContextProviders currently require 1-base genomeLocs."); + public char getReferenceBase( GenomeLoc genomeLoc ) throws InvalidPositionException { + validateLocation( genomeLoc ); + int offset = (int)(genomeLoc.getStart() - referenceInterval.getStart()); + return StringUtil.bytesToString( referenceSequence.getBases(), offset, 1 ).charAt(0); + } - // jump to the first reference site - return reference.seekForward(genomeLoc); + /** + * Validates that the genomeLoc is one base wide and is in the reference sequence. + * @param genomeLoc location to verify. + */ + private void validateLocation( GenomeLoc genomeLoc ) throws InvalidPositionException { + // + if( !referenceInterval.containsP(genomeLoc) ) + throw new InvalidPositionException( + String.format("Requested position %s not within interval %s", genomeLoc, referenceInterval)); + if( genomeLoc.getStart() != genomeLoc.getStop() ) + throw new InvalidPositionException( + String.format("Requested position larger than one base; start = %d, stop = %d", genomeLoc.getStart(), genomeLoc.getStop())); } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java index 540a7f358..2f7ced5f2 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java @@ -4,8 +4,10 @@ import org.broadinstitute.sting.gatk.iterators.BoundedReferenceIterator; import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; +import java.io.FileNotFoundException; /** * @@ -35,8 +37,7 @@ import java.io.File; */ public class ReferenceDataSource implements SimpleDataSource { - final protected FastaSequenceFile2 refFile; - final protected ReferenceIterator refIter; + final protected IndexedFastaSequenceFile refFile; /** * Query the data source for a region of interest, specified by the genome location. @@ -46,7 +47,7 @@ public class ReferenceDataSource implements SimpleDataSource { * @return an iterator of the appropriate type, that is limited by the region */ public BoundedReferenceIterator seek(GenomeLoc location) { - BoundedReferenceIterator ret = new BoundedReferenceIterator(refIter.seekForward(location), location); + BoundedReferenceIterator ret = new BoundedReferenceIterator(refFile, location); return ret; } @@ -58,9 +59,11 @@ public class ReferenceDataSource implements SimpleDataSource { if (!infile.canRead()) { throw new SimpleDataSourceLoadException("ReferenceDataSource: Unable to load file: " + refFileName); } - //this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName); - refFile = new FastaSequenceFile2(new File(refFileName)); - refIter = new ReferenceIterator(this.refFile); - + try { + refFile = new IndexedFastaSequenceFile(new File(refFileName)); + } + catch( FileNotFoundException ex ) { + throw new SimpleDataSourceLoadException( "Unable to find reference file", ex ); + } } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java index e65cd0406..0b84b7c9a 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java @@ -32,4 +32,8 @@ public class SimpleDataSourceLoadException extends StingException { public SimpleDataSourceLoadException(String msg) { super(msg); } + + public SimpleDataSourceLoadException(String msg,Throwable throwable) { + super(msg,throwable); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java index 802423e5f..25f79b9a2 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java @@ -9,19 +9,15 @@ import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException; import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; -import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; -import java.io.BufferedReader; import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; +import java.io.FileNotFoundException; import java.util.List; /** @@ -31,8 +27,8 @@ import java.util.List; public class MicroManager { private static long SHARD_SIZE = 100000L; - private File reads; - private FastaSequenceFile2 ref; + private List reads; + private IndexedFastaSequenceFile ref; private TraverseLociByReference traversalEngine = null; @@ -44,12 +40,17 @@ public class MicroManager { return traversalEngine; } - public MicroManager( File reads, // the reads file + public MicroManager( List reads, // the reads file(s) File refFile, // the reference file driving the traversal int nThreadsToUse ) { // maximum number of threads to use to do the work this.reads = reads; - ref = new FastaSequenceFile2(refFile); + try { + ref = new IndexedFastaSequenceFile(refFile); + } + catch( FileNotFoundException ex ) { + throw new RuntimeException("File not found opening fasta file; please do this check before MicroManaging", ex); + } GenomeLoc.setupRefContigOrdering(ref); traversalEngine = new TraverseLociByReference( reads, refFile, new java.util.ArrayList() ); @@ -72,34 +73,14 @@ public class MicroManager { ref.getSequenceDictionary(), SHARD_SIZE ); - ReferenceIterator refIter = new ReferenceIterator(ref); SAMDataSource dataSource = null; - try { - // todo: remove this code when we acutally handle command line args of multiple bam files - ArrayList fl = new ArrayList(); - if (reads.getName().endsWith(".list")) { - BufferedReader bis = new BufferedReader(new FileReader(reads)); - String line = null; - while ((line = bis.readLine()) != null) { - if (!line.equals("")){ - fl.add(new File(line)); - } - } - } else if (reads.getCanonicalPath().indexOf(",") > 0) { - for (String bamFile : reads.getCanonicalPath().split(",")) { - fl.add(new File(bamFile)); - } - - } else { - fl.add(reads); - } - dataSource = new SAMDataSource( fl ); + dataSource = new SAMDataSource( TraversalEngine.unpackReads(reads) ); } catch( SimpleDataSourceLoadException ex ) { throw new RuntimeException( ex ); } - catch( IOException ex ) { + catch( FileNotFoundException ex ) { throw new RuntimeException( ex ); } @@ -107,16 +88,17 @@ public class MicroManager { Object accumulator = null; for(Shard shard: shardStrategy) { - // CloseableIterator readShard = null; + GenomeLoc span = shard.getGenomeLoc(); + MergingSamRecordIterator2 readShard = null; try { - readShard = dataSource.seek( shard.getGenomeLoc() ); + readShard = dataSource.seek( span ); } catch( SimpleDataSourceLoadException ex ) { throw new RuntimeException( ex ); } - ReferenceProvider referenceProvider = new ReferenceProvider( refIter ); + ReferenceProvider referenceProvider = new ReferenceProvider( ref, span ); LocusContextProvider locusProvider = new LocusContextProvider( readShard ); // set the sam header of the traversal engine diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReferenceIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReferenceIterator.java index c916439f3..dcb23bcb0 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReferenceIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReferenceIterator.java @@ -2,9 +2,13 @@ package org.broadinstitute.sting.gatk.iterators; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.util.Iterator; +import edu.mit.broad.picard.reference.ReferenceSequence; +import net.sf.samtools.util.StringUtil; + /** * * User: aaron @@ -35,90 +39,43 @@ import java.util.Iterator; *

* TODO: Fix the underlying iterator and this class to model a real decorator pattern */ -public class BoundedReferenceIterator implements Iterator { +public class BoundedReferenceIterator implements Iterator { + private ReferenceSequence referenceSequence = null; + + // The start and end point of the read relative to the chromosome. + private long start; + private long end; + + // 0-based index into the base array. + private int position; // the location to screen over - private final GenomeLoc mLoc; - private final ReferenceIterator referenceIterator; /** * Default constructor * - * @param referenceIterator - * @param loc + * @param referenceSequenceFile sequence file over which to iterate + * @param loc subsequence of the reference over which to iterate. */ - public BoundedReferenceIterator(ReferenceIterator referenceIterator, GenomeLoc loc) { - this.referenceIterator = referenceIterator; - this.mLoc = loc; - } + public BoundedReferenceIterator(IndexedFastaSequenceFile referenceSequenceFile, GenomeLoc loc) { + start = loc.getStart(); + end = loc.getStop(); + position = 0; - - /** - * Create a BoundedReferenceIterator from a fasta seq and a genome loc - * @param refFile the fasta file, see the ReferenceIterator constructor - * @param loc our genome location - */ - public BoundedReferenceIterator(FastaSequenceFile2 refFile, GenomeLoc loc) { - this.referenceIterator = new ReferenceIterator(refFile); - this.mLoc = loc; - } - - /** - * isSubRegion - *

- * returns true if we include the whole passed in region - * - * @param loc the genome region to check - * @return true if we include THE WHOLE specified region - */ - protected boolean isSubRegion(GenomeLoc loc) { - // if the location is null, we assume we're all inclusive (we represent the whole genome). - if (mLoc == null || loc.isBetween(mLoc, mLoc)) { - return true; - } - return false; - } - - /** - * returns true if we include the whole passed in region - * - * @param contig - * @param start - * @param stop - * @return true if we enclose the passed region, false otherwise - */ - protected boolean isSubRegion(final String contig, final int start, final int stop) { - final GenomeLoc lc = new GenomeLoc(contig, start, stop); - return isSubRegion(lc); - } - - /** - * If we're less then the limiting genomeLoc - * - * @param loc - * @return - */ - protected boolean isLessThan(GenomeLoc loc) { - return loc.isPast(mLoc); + referenceSequence = referenceSequenceFile.getSubsequenceAt( loc.getContig(), position, end ); } // our adapted next function public boolean hasNext() { - // first check that we are within the search place - GenomeLoc loc = referenceIterator.getLocation(); - if (!isSubRegion(loc)) { - return false; - } - - return referenceIterator.hasNext(); + return (start + position) > end; } - public ReferenceIterator next() { - return referenceIterator.next(); + public Character next() { + return StringUtil.bytesToString( referenceSequence.getBases(), position++, 1 ).charAt(0); } public void remove() { - referenceIterator.remove(); + throw new UnsupportedOperationException("Cannot remove from a reference iterator"); } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index e05ea91b8..9b6d1f078 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -19,9 +19,7 @@ import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; import java.io.File; import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; +import java.util.*; public abstract class TraversalEngine { // list of reference ordered data objects @@ -44,7 +42,7 @@ public abstract class TraversalEngine { protected long maxReads = -1; // Name of the reads file, in BAM/SAM format - protected File readsFile = null; // the name of the reads file + protected List readsFiles = null; // the name of the reads file protected SAMFileReader samReader = null; // iterator over the sam records in the readsFile protected Iterator samReadIter = null; @@ -102,8 +100,8 @@ public abstract class TraversalEngine { * @param ref Reference file in FASTA format, assumes a .dict file is also available * @param rods Array of reference ordered data sets */ - public TraversalEngine(File reads, File ref, List> rods) { - readsFile = reads; + public TraversalEngine(List reads, File ref, List> rods) { + readsFiles = reads; refFileName = ref; this.rods = rods; } @@ -326,35 +324,66 @@ public abstract class TraversalEngine { return true; } + /** + * Unpack the files to be processed, given a list of files. That list of files can + * itself contain lists of other files to be read. + * @param inputFiles + * @return + */ + public static List unpackReads( List inputFiles ) throws FileNotFoundException { + List unpackedReads = new ArrayList(); + for( File inputFile: inputFiles ) { + if( inputFile.getName().endsWith(".list") ) { + for( String fileName : new xReadLines(inputFile) ) + unpackedReads.add( new File(fileName) ); + } + else + unpackedReads.add( inputFile ); + } + return unpackedReads; + } + protected Iterator initializeReads() { - samReader = initializeSAMFile(readsFile); + List allReadsFiles = null; + try { + allReadsFiles = unpackReads( readsFiles ); + } + catch( FileNotFoundException ex ) { + throw new RuntimeException("Unable to unpack reads", ex ); + } + + if( allReadsFiles.size() == 1 ) + samReader = initializeSAMFile(allReadsFiles.get(0)); + else + samReader = null; return WrapReadsIterator(getReadsIterator(samReader), true); } protected Iterator getReadsIterator(final SAMFileReader samReader) { // If the file has an index, querying functions are available. Use them if possible... - if ( samReader == null && readsFile.toString().endsWith(".list") ) { + if ( samReader == null && readsFiles.size() > 0 ) { SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate; + List allReadsFiles = null; List readers = new ArrayList(); try { - for ( String fileName : new xReadLines(readsFile) ) { - SAMFileReader reader = initializeSAMFile(new File(fileName)); - readers.add(reader); - } + allReadsFiles = unpackReads( readsFiles ); + } + catch(FileNotFoundException ex) { + throw new RuntimeException("Unable to unpack reads", ex); + } - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER); - return new MergingSamRecordIterator2(headerMerger); - } - catch ( FileNotFoundException e ) { - logger.fatal("Couldn't open file in sam file list: " + readsFile); + for ( File readsFile: allReadsFiles ) { + SAMFileReader reader = initializeSAMFile(readsFile); + readers.add(reader); } + + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER); + return new MergingSamRecordIterator2(headerMerger); + } + else { + return samReader.iterator(); } - //if (samReader.hasIndex()) { - // return new SamQueryIterator(samReader, locs); - //} else { - return samReader.iterator(); - //} } protected Iterator WrapReadsIterator( final Iterator rawIterator, final boolean enableVerification ) { @@ -379,7 +408,7 @@ public abstract class TraversalEngine { return wrappedIterator; } - protected SAMFileReader initializeSAMFile(final File samFile) { + protected SAMFileReader initializeSAMFile(File samFile) { // todo: fixme, this is a hack to try out dynamic merging if ( samFile.toString().endsWith(".list") ) { return null; diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByIntervals.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByIntervals.java index 043a56ad3..8d2c15ed0 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByIntervals.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByIntervals.java @@ -28,7 +28,7 @@ import edu.mit.broad.picard.filter.FilteringIterator; */ public class TraverseByIntervals extends TraversalEngine { - public TraverseByIntervals(File reads, File ref, List> rods) { + public TraverseByIntervals(List reads, File ref, List> rods) { super(reads, ref, rods); } @@ -55,7 +55,10 @@ public class TraverseByIntervals extends TraversalEngine { protected T traverseByIntervals(IntervalWalker walker, ArrayList locations) { logger.debug("Entering traverseByIntervals"); - samReader = initializeSAMFile(readsFile); + if(readsFiles.size() > 1) + throw new UnsupportedOperationException("Cannot do ByInterval traversal on file with multiple inputs."); + + samReader = initializeSAMFile(readsFiles.get(0)); verifySortOrder(true); diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java index 599d3725d..0fef71eaf 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.traversals; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.LocusContext; @@ -14,7 +13,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import java.util.List; -import java.util.Arrays; import java.util.Iterator; import java.util.ArrayList; import java.io.File; @@ -32,7 +30,7 @@ import edu.mit.broad.picard.filter.FilteringIterator; */ public class TraverseByLoci extends TraversalEngine { - public TraverseByLoci(File reads, File ref, List> rods) { + public TraverseByLoci(List reads, File ref, List> rods) { super(reads, ref, rods); } @@ -59,8 +57,10 @@ public class TraverseByLoci extends TraversalEngine { protected T traverseByLoci(LocusWalker walker, ArrayList locations) { logger.debug("Entering traverseByLoci"); - samReader = initializeSAMFile(readsFile); + if(readsFiles.size() > 1) + throw new UnsupportedOperationException("Cannot do ByLoci traversal on file with multiple inputs."); + samReader = initializeSAMFile(readsFiles.get(0)); verifySortOrder(true); // initialize the walker object diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java index 5d5075d3f..83f3c808f 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java @@ -30,7 +30,7 @@ import edu.mit.broad.picard.filter.FilteringIterator; */ public class TraverseByLociByReference extends TraverseByLoci { - public TraverseByLociByReference(File reads, File ref, List> rods) { + public TraverseByLociByReference(List reads, File ref, List> rods) { super(reads, ref, rods); } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java index 9b33c789c..24a86ee3a 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java @@ -27,7 +27,7 @@ import net.sf.samtools.SAMRecord; */ public class TraverseByReads extends TraversalEngine { - public TraverseByReads(File reads, File ref, List> rods) { + public TraverseByReads(List reads, File ref, List> rods) { super(reads, ref, rods); } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java index 69557a2a2..e49490b30 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java @@ -21,7 +21,7 @@ import net.sf.samtools.SAMRecord; */ public class TraverseByReference extends TraverseByLoci { - public TraverseByReference(File reads, File ref, List> rods) { + public TraverseByReference(List reads, File ref, List> rods) { super(reads, ref, rods); logger.debug("Creating TraverseByReference"); diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java index d08ae0da1..bd9cf4df1 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider; import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider; +import org.broadinstitute.sting.gatk.dataSources.providers.InvalidPositionException; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -33,7 +34,7 @@ public class TraverseLociByReference extends TraversalEngine { protected static Logger logger = Logger.getLogger(TraversalEngine.class); - public TraverseLociByReference(File reads, File ref, List> rods) { + public TraverseLociByReference(List reads, File ref, List> rods) { super( reads, ref, rods ); } @@ -66,14 +67,13 @@ public class TraverseLociByReference extends TraversalEngine { // Iterate forward to get all reference ordered data covering this locus final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus( site ); - ReferenceIterator refSite = referenceProvider.getReferenceSequence( site ); LocusContext locus = locusProvider.getLocusContext( site ); - locus.setReferenceContig( refSite.getCurrentContig() ); + + char refBase = referenceProvider.getReferenceBase( site ); if ( DOWNSAMPLE_BY_COVERAGE ) locus.downsampleToCoverage(downsamplingCoverage); - final char refBase = refSite.getBaseAsChar(); final boolean keepMeP = locusWalker.filter(tracker, refBase, locus); if (keepMeP) { M x = locusWalker.map(tracker, refBase, locus); @@ -90,4 +90,4 @@ public class TraverseLociByReference extends TraversalEngine { return sum; } -} \ No newline at end of file +} diff --git a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java index 5701d94c0..abad9755f 100755 --- a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java @@ -113,14 +113,30 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { } } + /** + * Retrieves the sequence dictionary for the fasta file. + * @return sequence dictionary of the fasta. + */ public SAMSequenceDictionary getSequenceDictionary() { return sequenceDictionary; } + /** + * Retrieves the complete sequence described by this contig. + * @param contig contig whose data should be returned. + * @return The full sequence associated with this contig. + */ public ReferenceSequence getSequence( String contig ) { - return getSubsequenceAt( contig, 0, (int)index.getIndexEntry(contig).getSize()-1 ); + return getSubsequenceAt( contig, 1, (int)index.getIndexEntry(contig).getSize() ); } + /** + * Gets the subsequence of the contig in the range [start,stop] + * @param contig Contig whose subsequence to retrieve. + * @param start inclusive, 1-based start of region. + * @param stop inclusive, 1-based stop of region. + * @return The partial reference sequence associated with this range. + */ public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) { if(start > stop) throw new PicardException(String.format("Malformed query; start point %d lies after end point %d",start,stop)); @@ -140,8 +156,8 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { final int bytesPerLine = indexEntry.getBytesPerLine(); // Start reading at the closest start-of-line to our data. - long readStart = indexEntry.getLocation() + (start / basesPerLine) * bytesPerLine; - int dataOfInterestStart = (int)(start % basesPerLine); + long readStart = indexEntry.getLocation() + ((start-1) / basesPerLine) * bytesPerLine; + int dataOfInterestStart = (int)((start-1) % basesPerLine); byte[] accumulator = new byte[length]; int nextAccumulatorSlot = 0; diff --git a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java index f031f345d..265f02ebd 100755 --- a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java +++ b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java @@ -53,7 +53,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest { @Test public void testFirstSequence() { long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,firstBasesOfChrM.length()-1); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",1,firstBasesOfChrM.length()); long endTime = System.currentTimeMillis(); Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); @@ -68,7 +68,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest { @Test public void testFirstSequenceExtended() { long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,extendedBasesOfChrM.length()-1); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",1,extendedBasesOfChrM.length()); long endTime = System.currentTimeMillis(); Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); @@ -87,8 +87,8 @@ public class IndexedFastaSequenceFileTest extends BaseTest { long startTime = System.currentTimeMillis(); ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", - bytesToChopOff, - bytesToChopOff + truncated.length() - 1); + bytesToChopOff + 1, + bytesToChopOff + truncated.length()); long endTime = System.currentTimeMillis(); Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); @@ -107,8 +107,8 @@ public class IndexedFastaSequenceFileTest extends BaseTest { long startTime = System.currentTimeMillis(); ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", - bytesToChopOff, - bytesToChopOff + truncated.length() - 1); + bytesToChopOff + 1, + bytesToChopOff + truncated.length()); long endTime = System.currentTimeMillis(); Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); @@ -204,7 +204,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest { @Test public void testFirstOfChr1() { long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr1",0,firstBasesOfChr1.length()-1); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr1",1,firstBasesOfChr1.length()); long endTime = System.currentTimeMillis(); Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chr1"); @@ -219,7 +219,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest { @Test public void testFirstOfChr8() { long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr8",0,firstBasesOfChr8.length()-1); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr8",1,firstBasesOfChr8.length()); long endTime = System.currentTimeMillis(); Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chr8");