Introduce new index-based fasta reader. Clean up MicroManager code, pushing necessary code back into TraversalEngine.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@531 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-04-24 19:40:21 +00:00
parent 3739682bef
commit e50ae97fe1
16 changed files with 212 additions and 175 deletions

View File

@ -28,12 +28,14 @@ import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Arrays;
import java.util.Collections;
public class GenomeAnalysisTK extends CommandLineProgram {
public static GenomeAnalysisTK Instance = null;
// parameters and their defaults
public File INPUT_FILE = null;
public List<File> INPUT_FILES = null;
public String MAX_READS_ARG = "-1";
public String STRICTNESS_ARG = "strict";
public File REF_FILE_ARG = null;
@ -106,7 +108,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
* Flags don't take an argument, the associated Boolean gets set to true if the flag appears on the command line.
*/
protected void setupArgs() {
m_parser.addOptionalArg("input_file", "I", "SAM or BAM file", "INPUT_FILE");
m_parser.addOptionalArgList("input_file", "I", "SAM or BAM file", "INPUT_FILES");
//m_parser.addRequiredArg("input_file", "I", "SAM or BAM file", "INPUT_FILE");
m_parser.addOptionalArg("maximum_reads", "M", "Maximum number of reads to process before exiting", "MAX_READS_ARG");
m_parser.addOptionalArg("validation_strictness", "S", "How strict should we be with validation (LENIENT|SILENT|STRICT)", "STRICTNESS_ARG");
@ -255,34 +257,34 @@ public class GenomeAnalysisTK extends CommandLineProgram {
if ( REF_FILE_ARG == null )
Utils.scareUser(String.format("Locus-based traversals require a reference file but none was given"));
if ( INPUT_FILE == null ) {
if ( INPUT_FILES == null || INPUT_FILES.size() == 0 ) {
if ( walker.requiresReads() )
Utils.scareUser(String.format("Analysis %s requires reads, but none were given", Analysis_Name));
this.engine = new TraverseByReference(null, REF_FILE_ARG, rods);
} else {
if ( walker.cannotHandleReads() )
Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILE));
Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILES));
if ( WALK_ALL_LOCI ) {
// TODO: Temporary debugging code. Activate the new debugging code only when the MicroManager
// is not filtered.
if( ENABLE_THREADING ) {
logger.warn("Preliminary threading support enabled");
microManager = new MicroManager( INPUT_FILE, REF_FILE_ARG, numThreads );
microManager = new MicroManager( INPUT_FILES, REF_FILE_ARG, numThreads );
this.engine = microManager.getTraversalEngine();
}
else {
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
this.engine = new TraverseByLociByReference(INPUT_FILES, REF_FILE_ARG, rods);
}
}
else
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
this.engine = new TraverseByLoci(INPUT_FILES, REF_FILE_ARG, rods);
}
} else if ( my_walker instanceof IntervalWalker ) {
this.engine = new TraverseByIntervals(INPUT_FILE, REF_FILE_ARG, rods);
this.engine = new TraverseByIntervals(INPUT_FILES, REF_FILE_ARG, rods);
} else {
// we're a read walker
this.engine = new TraverseByReads(INPUT_FILE, REF_FILE_ARG, rods);
this.engine = new TraverseByReads(INPUT_FILES, REF_FILE_ARG, rods);
}
// Prepare the sort ordering w.r.t. the sequence dictionary

View File

@ -0,0 +1,23 @@
package org.broadinstitute.sting.gatk.dataSources.providers;
import org.broadinstitute.sting.utils.StingException;
/**
* Created by IntelliJ IDEA.
* User: hanna
* Date: Apr 16, 2009
* Time: 4:11:40 PM
*
* Thrown to indicate invalid positions passed to the providers.
* Extend from RuntimeException to make it easier on our walker writers; don't make
* them catch every exception that comes their way.
*/
public class InvalidPositionException extends RuntimeException {
public InvalidPositionException(String message) {
super(message);
}
public InvalidPositionException(String message, Throwable throwable) {
super(message,throwable);
}
}

View File

@ -1,7 +1,9 @@
package org.broadinstitute.sting.gatk.dataSources.providers;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequence;
import net.sf.samtools.util.StringUtil;
/**
* Created by IntelliJ IDEA.
@ -11,17 +13,33 @@ import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
* To change this template use File | Settings | File Templates.
*/
public class ReferenceProvider {
private ReferenceIterator reference;
private ReferenceSequence referenceSequence;
private GenomeLoc referenceInterval;
public ReferenceProvider( ReferenceIterator reference ) {
this.reference = reference;
public ReferenceProvider( IndexedFastaSequenceFile sequenceFile, GenomeLoc position ) {
this.referenceSequence = sequenceFile.getSubsequenceAt( position.getContig(),
position.getStart(),
position.getStop() );
this.referenceInterval = position;
}
public ReferenceIterator getReferenceSequence( GenomeLoc genomeLoc ) {
if( (genomeLoc.getStop() - genomeLoc.getStart()) > 0 )
throw new RuntimeException( "Internal error :LocusContextProviders currently require 1-base genomeLocs.");
public char getReferenceBase( GenomeLoc genomeLoc ) throws InvalidPositionException {
validateLocation( genomeLoc );
int offset = (int)(genomeLoc.getStart() - referenceInterval.getStart());
return StringUtil.bytesToString( referenceSequence.getBases(), offset, 1 ).charAt(0);
}
// jump to the first reference site
return reference.seekForward(genomeLoc);
/**
* Validates that the genomeLoc is one base wide and is in the reference sequence.
* @param genomeLoc location to verify.
*/
private void validateLocation( GenomeLoc genomeLoc ) throws InvalidPositionException {
//
if( !referenceInterval.containsP(genomeLoc) )
throw new InvalidPositionException(
String.format("Requested position %s not within interval %s", genomeLoc, referenceInterval));
if( genomeLoc.getStart() != genomeLoc.getStop() )
throw new InvalidPositionException(
String.format("Requested position larger than one base; start = %d, stop = %d", genomeLoc.getStart(), genomeLoc.getStop()));
}
}

View File

@ -4,8 +4,10 @@ import org.broadinstitute.sting.gatk.iterators.BoundedReferenceIterator;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.File;
import java.io.FileNotFoundException;
/**
*
@ -35,8 +37,7 @@ import java.io.File;
*/
public class ReferenceDataSource implements SimpleDataSource {
final protected FastaSequenceFile2 refFile;
final protected ReferenceIterator refIter;
final protected IndexedFastaSequenceFile refFile;
/**
* Query the data source for a region of interest, specified by the genome location.
@ -46,7 +47,7 @@ public class ReferenceDataSource implements SimpleDataSource {
* @return an iterator of the appropriate type, that is limited by the region
*/
public BoundedReferenceIterator seek(GenomeLoc location) {
BoundedReferenceIterator ret = new BoundedReferenceIterator(refIter.seekForward(location), location);
BoundedReferenceIterator ret = new BoundedReferenceIterator(refFile, location);
return ret;
}
@ -58,9 +59,11 @@ public class ReferenceDataSource implements SimpleDataSource {
if (!infile.canRead()) {
throw new SimpleDataSourceLoadException("ReferenceDataSource: Unable to load file: " + refFileName);
}
//this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
refFile = new FastaSequenceFile2(new File(refFileName));
refIter = new ReferenceIterator(this.refFile);
try {
refFile = new IndexedFastaSequenceFile(new File(refFileName));
}
catch( FileNotFoundException ex ) {
throw new SimpleDataSourceLoadException( "Unable to find reference file", ex );
}
}
}

View File

@ -32,4 +32,8 @@ public class SimpleDataSourceLoadException extends StingException {
public SimpleDataSourceLoadException(String msg) {
super(msg);
}
public SimpleDataSourceLoadException(String msg,Throwable throwable) {
super(msg,throwable);
}
}

View File

@ -9,19 +9,15 @@ import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.io.FileNotFoundException;
import java.util.List;
/**
@ -31,8 +27,8 @@ import java.util.List;
public class MicroManager {
private static long SHARD_SIZE = 100000L;
private File reads;
private FastaSequenceFile2 ref;
private List<File> reads;
private IndexedFastaSequenceFile ref;
private TraverseLociByReference traversalEngine = null;
@ -44,12 +40,17 @@ public class MicroManager {
return traversalEngine;
}
public MicroManager( File reads, // the reads file
public MicroManager( List<File> reads, // the reads file(s)
File refFile, // the reference file driving the traversal
int nThreadsToUse ) { // maximum number of threads to use to do the work
this.reads = reads;
ref = new FastaSequenceFile2(refFile);
try {
ref = new IndexedFastaSequenceFile(refFile);
}
catch( FileNotFoundException ex ) {
throw new RuntimeException("File not found opening fasta file; please do this check before MicroManaging", ex);
}
GenomeLoc.setupRefContigOrdering(ref);
traversalEngine = new TraverseLociByReference( reads, refFile, new java.util.ArrayList() );
@ -72,34 +73,14 @@ public class MicroManager {
ref.getSequenceDictionary(),
SHARD_SIZE );
ReferenceIterator refIter = new ReferenceIterator(ref);
SAMDataSource dataSource = null;
try {
// todo: remove this code when we acutally handle command line args of multiple bam files
ArrayList<File> fl = new ArrayList<File>();
if (reads.getName().endsWith(".list")) {
BufferedReader bis = new BufferedReader(new FileReader(reads));
String line = null;
while ((line = bis.readLine()) != null) {
if (!line.equals("")){
fl.add(new File(line));
}
}
} else if (reads.getCanonicalPath().indexOf(",") > 0) {
for (String bamFile : reads.getCanonicalPath().split(",")) {
fl.add(new File(bamFile));
}
} else {
fl.add(reads);
}
dataSource = new SAMDataSource( fl );
dataSource = new SAMDataSource( TraversalEngine.unpackReads(reads) );
}
catch( SimpleDataSourceLoadException ex ) {
throw new RuntimeException( ex );
}
catch( IOException ex ) {
catch( FileNotFoundException ex ) {
throw new RuntimeException( ex );
}
@ -107,16 +88,17 @@ public class MicroManager {
Object accumulator = null;
for(Shard shard: shardStrategy) {
// CloseableIterator<SAMRecord> readShard = null;
GenomeLoc span = shard.getGenomeLoc();
MergingSamRecordIterator2 readShard = null;
try {
readShard = dataSource.seek( shard.getGenomeLoc() );
readShard = dataSource.seek( span );
}
catch( SimpleDataSourceLoadException ex ) {
throw new RuntimeException( ex );
}
ReferenceProvider referenceProvider = new ReferenceProvider( refIter );
ReferenceProvider referenceProvider = new ReferenceProvider( ref, span );
LocusContextProvider locusProvider = new LocusContextProvider( readShard );
// set the sam header of the traversal engine

View File

@ -2,9 +2,13 @@ package org.broadinstitute.sting.gatk.iterators;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.util.Iterator;
import edu.mit.broad.picard.reference.ReferenceSequence;
import net.sf.samtools.util.StringUtil;
/**
*
* User: aaron
@ -35,90 +39,43 @@ import java.util.Iterator;
* <p/>
* TODO: Fix the underlying iterator and this class to model a real decorator pattern
*/
public class BoundedReferenceIterator implements Iterator<ReferenceIterator> {
public class BoundedReferenceIterator implements Iterator<Character> {
private ReferenceSequence referenceSequence = null;
// The start and end point of the read relative to the chromosome.
private long start;
private long end;
// 0-based index into the base array.
private int position;
// the location to screen over
private final GenomeLoc mLoc;
private final ReferenceIterator referenceIterator;
/**
* Default constructor
*
* @param referenceIterator
* @param loc
* @param referenceSequenceFile sequence file over which to iterate
* @param loc subsequence of the reference over which to iterate.
*/
public BoundedReferenceIterator(ReferenceIterator referenceIterator, GenomeLoc loc) {
this.referenceIterator = referenceIterator;
this.mLoc = loc;
}
public BoundedReferenceIterator(IndexedFastaSequenceFile referenceSequenceFile, GenomeLoc loc) {
start = loc.getStart();
end = loc.getStop();
position = 0;
/**
* Create a BoundedReferenceIterator from a fasta seq and a genome loc
* @param refFile the fasta file, see the ReferenceIterator constructor
* @param loc our genome location
*/
public BoundedReferenceIterator(FastaSequenceFile2 refFile, GenomeLoc loc) {
this.referenceIterator = new ReferenceIterator(refFile);
this.mLoc = loc;
}
/**
* isSubRegion
* <p/>
* returns true if we include the whole passed in region
*
* @param loc the genome region to check
* @return true if we include THE WHOLE specified region
*/
protected boolean isSubRegion(GenomeLoc loc) {
// if the location is null, we assume we're all inclusive (we represent the whole genome).
if (mLoc == null || loc.isBetween(mLoc, mLoc)) {
return true;
}
return false;
}
/**
* returns true if we include the whole passed in region
*
* @param contig
* @param start
* @param stop
* @return true if we enclose the passed region, false otherwise
*/
protected boolean isSubRegion(final String contig, final int start, final int stop) {
final GenomeLoc lc = new GenomeLoc(contig, start, stop);
return isSubRegion(lc);
}
/**
* If we're less then the limiting genomeLoc
*
* @param loc
* @return
*/
protected boolean isLessThan(GenomeLoc loc) {
return loc.isPast(mLoc);
referenceSequence = referenceSequenceFile.getSubsequenceAt( loc.getContig(), position, end );
}
// our adapted next function
public boolean hasNext() {
// first check that we are within the search place
GenomeLoc loc = referenceIterator.getLocation();
if (!isSubRegion(loc)) {
return false;
}
return referenceIterator.hasNext();
return (start + position) > end;
}
public ReferenceIterator next() {
return referenceIterator.next();
public Character next() {
return StringUtil.bytesToString( referenceSequence.getBases(), position++, 1 ).charAt(0);
}
public void remove() {
referenceIterator.remove();
throw new UnsupportedOperationException("Cannot remove from a reference iterator");
}

View File

@ -19,9 +19,7 @@ import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.*;
public abstract class TraversalEngine {
// list of reference ordered data objects
@ -44,7 +42,7 @@ public abstract class TraversalEngine {
protected long maxReads = -1;
// Name of the reads file, in BAM/SAM format
protected File readsFile = null; // the name of the reads file
protected List<File> readsFiles = null; // the name of the reads file
protected SAMFileReader samReader = null;
// iterator over the sam records in the readsFile
protected Iterator<SAMRecord> samReadIter = null;
@ -102,8 +100,8 @@ public abstract class TraversalEngine {
* @param ref Reference file in FASTA format, assumes a .dict file is also available
* @param rods Array of reference ordered data sets
*/
public TraversalEngine(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
readsFile = reads;
public TraversalEngine(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
readsFiles = reads;
refFileName = ref;
this.rods = rods;
}
@ -326,35 +324,66 @@ public abstract class TraversalEngine {
return true;
}
/**
* Unpack the files to be processed, given a list of files. That list of files can
* itself contain lists of other files to be read.
* @param inputFiles
* @return
*/
public static List<File> unpackReads( List<File> inputFiles ) throws FileNotFoundException {
List<File> unpackedReads = new ArrayList<File>();
for( File inputFile: inputFiles ) {
if( inputFile.getName().endsWith(".list") ) {
for( String fileName : new xReadLines(inputFile) )
unpackedReads.add( new File(fileName) );
}
else
unpackedReads.add( inputFile );
}
return unpackedReads;
}
protected Iterator<SAMRecord> initializeReads() {
samReader = initializeSAMFile(readsFile);
List<File> allReadsFiles = null;
try {
allReadsFiles = unpackReads( readsFiles );
}
catch( FileNotFoundException ex ) {
throw new RuntimeException("Unable to unpack reads", ex );
}
if( allReadsFiles.size() == 1 )
samReader = initializeSAMFile(allReadsFiles.get(0));
else
samReader = null;
return WrapReadsIterator(getReadsIterator(samReader), true);
}
protected Iterator<SAMRecord> getReadsIterator(final SAMFileReader samReader) {
// If the file has an index, querying functions are available. Use them if possible...
if ( samReader == null && readsFile.toString().endsWith(".list") ) {
if ( samReader == null && readsFiles.size() > 0 ) {
SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
List<File> allReadsFiles = null;
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
try {
for ( String fileName : new xReadLines(readsFile) ) {
SAMFileReader reader = initializeSAMFile(new File(fileName));
readers.add(reader);
}
allReadsFiles = unpackReads( readsFiles );
}
catch(FileNotFoundException ex) {
throw new RuntimeException("Unable to unpack reads", ex);
}
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
return new MergingSamRecordIterator2(headerMerger);
}
catch ( FileNotFoundException e ) {
logger.fatal("Couldn't open file in sam file list: " + readsFile);
for ( File readsFile: allReadsFiles ) {
SAMFileReader reader = initializeSAMFile(readsFile);
readers.add(reader);
}
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
return new MergingSamRecordIterator2(headerMerger);
}
else {
return samReader.iterator();
}
//if (samReader.hasIndex()) {
// return new SamQueryIterator(samReader, locs);
//} else {
return samReader.iterator();
//}
}
protected Iterator<SAMRecord> WrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
@ -379,7 +408,7 @@ public abstract class TraversalEngine {
return wrappedIterator;
}
protected SAMFileReader initializeSAMFile(final File samFile) {
protected SAMFileReader initializeSAMFile(File samFile) {
// todo: fixme, this is a hack to try out dynamic merging
if ( samFile.toString().endsWith(".list") ) {
return null;

View File

@ -28,7 +28,7 @@ import edu.mit.broad.picard.filter.FilteringIterator;
*/
public class TraverseByIntervals extends TraversalEngine {
public TraverseByIntervals(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
public TraverseByIntervals(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
super(reads, ref, rods);
}
@ -55,7 +55,10 @@ public class TraverseByIntervals extends TraversalEngine {
protected <M, T> T traverseByIntervals(IntervalWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
logger.debug("Entering traverseByIntervals");
samReader = initializeSAMFile(readsFile);
if(readsFiles.size() > 1)
throw new UnsupportedOperationException("Cannot do ByInterval traversal on file with multiple inputs.");
samReader = initializeSAMFile(readsFiles.get(0));
verifySortOrder(true);

View File

@ -1,6 +1,5 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.LocusContext;
@ -14,7 +13,6 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import java.util.List;
import java.util.Arrays;
import java.util.Iterator;
import java.util.ArrayList;
import java.io.File;
@ -32,7 +30,7 @@ import edu.mit.broad.picard.filter.FilteringIterator;
*/
public class TraverseByLoci extends TraversalEngine {
public TraverseByLoci(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
public TraverseByLoci(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
super(reads, ref, rods);
}
@ -59,8 +57,10 @@ public class TraverseByLoci extends TraversalEngine {
protected <M, T> T traverseByLoci(LocusWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
logger.debug("Entering traverseByLoci");
samReader = initializeSAMFile(readsFile);
if(readsFiles.size() > 1)
throw new UnsupportedOperationException("Cannot do ByLoci traversal on file with multiple inputs.");
samReader = initializeSAMFile(readsFiles.get(0));
verifySortOrder(true);
// initialize the walker object

View File

@ -30,7 +30,7 @@ import edu.mit.broad.picard.filter.FilteringIterator;
*/
public class TraverseByLociByReference extends TraverseByLoci {
public TraverseByLociByReference(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
public TraverseByLociByReference(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
super(reads, ref, rods);
}

View File

@ -27,7 +27,7 @@ import net.sf.samtools.SAMRecord;
*/
public class TraverseByReads extends TraversalEngine {
public TraverseByReads(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
public TraverseByReads(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
super(reads, ref, rods);
}

View File

@ -21,7 +21,7 @@ import net.sf.samtools.SAMRecord;
*/
public class TraverseByReference extends TraverseByLoci {
public TraverseByReference(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
public TraverseByReference(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
super(reads, ref, rods);
logger.debug("Creating TraverseByReference");

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
import org.broadinstitute.sting.gatk.dataSources.providers.InvalidPositionException;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -33,7 +34,7 @@ public class TraverseLociByReference extends TraversalEngine {
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
public TraverseLociByReference(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
public TraverseLociByReference(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
super( reads, ref, rods );
}
@ -66,14 +67,13 @@ public class TraverseLociByReference extends TraversalEngine {
// Iterate forward to get all reference ordered data covering this locus
final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus( site );
ReferenceIterator refSite = referenceProvider.getReferenceSequence( site );
LocusContext locus = locusProvider.getLocusContext( site );
locus.setReferenceContig( refSite.getCurrentContig() );
char refBase = referenceProvider.getReferenceBase( site );
if ( DOWNSAMPLE_BY_COVERAGE )
locus.downsampleToCoverage(downsamplingCoverage);
final char refBase = refSite.getBaseAsChar();
final boolean keepMeP = locusWalker.filter(tracker, refBase, locus);
if (keepMeP) {
M x = locusWalker.map(tracker, refBase, locus);
@ -90,4 +90,4 @@ public class TraverseLociByReference extends TraversalEngine {
return sum;
}
}
}

View File

@ -113,14 +113,30 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
}
}
/**
* Retrieves the sequence dictionary for the fasta file.
* @return sequence dictionary of the fasta.
*/
public SAMSequenceDictionary getSequenceDictionary() {
return sequenceDictionary;
}
/**
* Retrieves the complete sequence described by this contig.
* @param contig contig whose data should be returned.
* @return The full sequence associated with this contig.
*/
public ReferenceSequence getSequence( String contig ) {
return getSubsequenceAt( contig, 0, (int)index.getIndexEntry(contig).getSize()-1 );
return getSubsequenceAt( contig, 1, (int)index.getIndexEntry(contig).getSize() );
}
/**
* Gets the subsequence of the contig in the range [start,stop]
* @param contig Contig whose subsequence to retrieve.
* @param start inclusive, 1-based start of region.
* @param stop inclusive, 1-based stop of region.
* @return The partial reference sequence associated with this range.
*/
public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) {
if(start > stop)
throw new PicardException(String.format("Malformed query; start point %d lies after end point %d",start,stop));
@ -140,8 +156,8 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
final int bytesPerLine = indexEntry.getBytesPerLine();
// Start reading at the closest start-of-line to our data.
long readStart = indexEntry.getLocation() + (start / basesPerLine) * bytesPerLine;
int dataOfInterestStart = (int)(start % basesPerLine);
long readStart = indexEntry.getLocation() + ((start-1) / basesPerLine) * bytesPerLine;
int dataOfInterestStart = (int)((start-1) % basesPerLine);
byte[] accumulator = new byte[length];
int nextAccumulatorSlot = 0;

View File

@ -53,7 +53,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
@Test
public void testFirstSequence() {
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,firstBasesOfChrM.length()-1);
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",1,firstBasesOfChrM.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM");
@ -68,7 +68,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
@Test
public void testFirstSequenceExtended() {
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,extendedBasesOfChrM.length()-1);
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",1,extendedBasesOfChrM.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM");
@ -87,8 +87,8 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",
bytesToChopOff,
bytesToChopOff + truncated.length() - 1);
bytesToChopOff + 1,
bytesToChopOff + truncated.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM");
@ -107,8 +107,8 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",
bytesToChopOff,
bytesToChopOff + truncated.length() - 1);
bytesToChopOff + 1,
bytesToChopOff + truncated.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM");
@ -204,7 +204,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
@Test
public void testFirstOfChr1() {
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr1",0,firstBasesOfChr1.length()-1);
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr1",1,firstBasesOfChr1.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chr1");
@ -219,7 +219,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
@Test
public void testFirstOfChr8() {
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr8",0,firstBasesOfChr8.length()-1);
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr8",1,firstBasesOfChr8.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chr8");