Merge pull request #1423 from broadinstitute/rhl_read_sra

Added support for directly reading SRA runs
This commit is contained in:
Ron Levine 2016-08-02 18:10:33 -04:00 committed by GitHub
commit 0630c7befd
61 changed files with 1007 additions and 518 deletions

View File

@ -51,9 +51,9 @@
package org.broadinstitute.gatk.tools.walkers.bqsr;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.engine.recalibration.*;
import org.broadinstitute.gatk.engine.walkers.*;
@ -194,7 +194,7 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to mask out known variant sites. Please provide a VCF file containing known sites of genetic variation.";
private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
private ReferenceSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
private final static byte NO_BAQ_UNCERTAINTY = (byte)'@';
/**

View File

@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFConstants;
@ -487,7 +488,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private HaplotypeCallerGenotypingEngine genotypingEngine = null;
// fasta reference reader to supplement the edges of the reference sequence
protected CachingIndexedFastaSequenceFile referenceReader;
protected ReferenceSequenceFile referenceReader;
// reference base padding size
private static final int REFERENCE_PADDING = 500;
@ -683,12 +684,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
vcfWriter.writeHeader(new VCFHeader(headerInfo, sampleSet));
try {
// fasta reference reader to supplement the edges of the reference sequence
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
} catch( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
}
// fasta reference reader to supplement the edges of the reference sequence
referenceReader = CachingIndexedFastaSequenceFile.checkAndCreate(getToolkit().getArguments().referenceFile);
// create and setup the assembler
assemblyEngine = new ReadThreadingAssembler(RTAC.maxNumHaplotypesInPopulation, RTAC.kmerSizes, RTAC.dontIncreaseKmerSizesForCycles, RTAC.allowNonUniqueKmersInRef, RTAC.numPruningSamples);

View File

@ -51,7 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.commandline.RodBinding;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@ -71,7 +71,7 @@ import java.util.*;
public class VariantOverlapAnnotatorUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
@BeforeClass
public void setup() throws FileNotFoundException {

View File

@ -76,5 +76,4 @@ public class BaseQualitySumPerAlleleBySampleUnitTest {
Assert.assertFalse(a.isUsableRead(read));
}
}

View File

@ -52,6 +52,7 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.PositionalBufferedStream;
import htsjdk.variant.variantcontext.VariantContext;
@ -207,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
try {
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
final ReferenceSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary());
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";

View File

@ -51,11 +51,11 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.gatk.utils.GenomeLoc;
@ -80,7 +80,7 @@ import java.util.*;
public class LocalAssemblyEngineUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
private SAMFileHeader header;
@BeforeClass

View File

@ -51,9 +51,8 @@
package org.broadinstitute.gatk.tools.walkers.indels;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.UnvalidatingGenomeLoc;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
@ -76,7 +75,7 @@ public class PairHMMIndelErrorModelUnitTest extends BaseTest {
@BeforeClass
public void setup() throws FileNotFoundException {
final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
}

View File

@ -51,9 +51,9 @@
package org.broadinstitute.gatk.utils;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.Assert;
@ -71,7 +71,7 @@ public class ContigComparatorUnitTest extends BaseTest {
@BeforeClass
public void setup() throws FileNotFoundException {
// sequence
final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final GenomeLocParser genomeLocParser = new GenomeLocParser(seq);
dictForFails = genomeLocParser.getContigs();
}
@ -81,7 +81,7 @@ public class ContigComparatorUnitTest extends BaseTest {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final String ref : Arrays.asList(b37KGReference, hg18Reference) ) {
final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(ref));
final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(ref));
final GenomeLocParser genomeLocParser = new GenomeLocParser(seq);
final SAMSequenceDictionary dict = genomeLocParser.getContigs();

View File

@ -52,6 +52,7 @@
package org.broadinstitute.gatk.utils.genotyper;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.BaseUtils;
@ -64,7 +65,6 @@ import java.util.Map;
import java.util.List;
import org.testng.Assert;
import org.testng.annotations.Test;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
@ -81,7 +81,7 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
// example fasta index file, can be deleted if you don't use the reference
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
@BeforeClass
public void setup() throws FileNotFoundException {

View File

@ -37,6 +37,7 @@ import org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor;
import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet;
import org.broadinstitute.gatk.engine.walkers.Walker;
import org.broadinstitute.gatk.utils.text.ListFileUtils;
import htsjdk.samtools.sra.SRAAccession;
import java.util.ArrayList;
import java.util.Arrays;
@ -63,10 +64,19 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
/**
* A list of all the arguments initially used as sources.
*/
private final Collection<Object> argumentSources = new ArrayList<Object>();
private final Collection<Object> argumentSources = new ArrayList<>();
protected static Logger logger = Logger.getLogger(CommandLineExecutable.class);
private final static String SRA_LIBS_DOWNLOAD = "samjdk.sra_libraries_download";
/**
* Set GATK version to be used as part of user agent for network requests
*/
static {
System.setProperty(SRA_LIBS_DOWNLOAD, "true");
SRAAccession.setAppVersionString("GATK " + getVersionNumber());
}
/**
* this is the function that the inheriting class can expect to have called
* when the command line system has initialized.

View File

@ -905,7 +905,7 @@ public class GenomeAnalysisEngine {
* @return A data source for the given set of reads.
*/
private SAMDataSource createReadsDataSource(final GATKArgumentCollection argCollection, final GenomeLocParser genomeLocParser,
final IndexedFastaSequenceFile refReader, final Map<String, String> sampleRenameMap) {
final ReferenceSequenceFile refReader, final Map<String, String> sampleRenameMap) {
DownsamplingMethod downsamplingMethod = getDownsamplingMethod();
// Synchronize the method back into the collection so that it shows up when

View File

@ -27,6 +27,7 @@ package org.broadinstitute.gatk.engine.alignment.bwa.java;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.engine.alignment.Aligner;
import org.broadinstitute.gatk.engine.alignment.Alignment;
import org.broadinstitute.gatk.utils.BaseUtils;
@ -126,7 +127,7 @@ public class AlignerTestHarness {
mismatches++;
IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
final ReferenceSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
System.out.printf("read = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(read.getReadString(),read.getCigar(),CigarOperator.DELETION),
read.getAlignmentStart(),

View File

@ -25,7 +25,7 @@
package org.broadinstitute.gatk.engine.datasources.providers;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.engine.ReadProperties;
import org.broadinstitute.gatk.engine.datasources.reads.Shard;
import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource;
@ -62,7 +62,7 @@ public class LocusShardDataProvider extends ShardDataProvider {
* @param shard The chunk of data over which traversals happen.
* @param reference A getter for a section of the reference.
*/
public LocusShardDataProvider(Shard shard, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, GenomeLoc locus, LocusIterator locusIterator, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods) {
public LocusShardDataProvider(final Shard shard, final ReadProperties sourceInfo, final GenomeLocParser genomeLocParser, final GenomeLoc locus, final LocusIterator locusIterator, final ReferenceSequenceFile reference, final Collection<ReferenceOrderedDataSource> rods) {
super(shard,genomeLocParser,reference,rods);
this.sourceInfo = sourceInfo;
this.locus = locus;

View File

@ -26,6 +26,7 @@
package org.broadinstitute.gatk.engine.datasources.providers;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.engine.datasources.reads.Shard;
import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator;
@ -50,7 +51,7 @@ public class ReadShardDataProvider extends ShardDataProvider {
* @param shard The chunk of data over which traversals happen.
* @param reference A getter for a section of the reference.
*/
public ReadShardDataProvider(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods) {
public ReadShardDataProvider(final Shard shard, final GenomeLocParser genomeLocParser, final GATKSAMIterator reads, final ReferenceSequenceFile reference, final Collection<ReferenceOrderedDataSource> rods) {
super(shard,genomeLocParser,reference,rods);
this.reads = reads;
}

View File

@ -29,6 +29,7 @@ import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
@ -61,13 +62,13 @@ public class ReferenceView implements View {
/**
* The source of reference data.
*/
protected IndexedFastaSequenceFile reference = null;
protected ReferenceSequenceFile reference = null;
/**
* Create a new ReferenceView.
* @param provider
*/
public ReferenceView( ShardDataProvider provider ) {
public ReferenceView( final ShardDataProvider provider ) {
this.genomeLocParser = provider.getGenomeLocParser();
this.reference = provider.getReference();
}

View File

@ -25,7 +25,7 @@
package org.broadinstitute.gatk.engine.datasources.providers;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.engine.datasources.reads.Shard;
import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@ -70,7 +70,7 @@ public abstract class ShardDataProvider {
/**
* Provider of reference data for this particular shard.
*/
private final IndexedFastaSequenceFile reference;
private final ReferenceSequenceFile reference;
/**
* Sources of reference-ordered data.
@ -106,7 +106,7 @@ public abstract class ShardDataProvider {
* Gets a pointer into the given indexed fasta sequence file.
* @return The indexed fasta sequence file.
*/
IndexedFastaSequenceFile getReference() {
ReferenceSequenceFile getReference() {
return reference;
}
@ -131,7 +131,7 @@ public abstract class ShardDataProvider {
* @param shard The chunk of data over which traversals happen.
* @param reference A getter for a section of the reference.
*/
public ShardDataProvider(Shard shard,GenomeLocParser genomeLocParser,IndexedFastaSequenceFile reference,Collection<ReferenceOrderedDataSource> rods) {
public ShardDataProvider(final Shard shard, final GenomeLocParser genomeLocParser, final ReferenceSequenceFile reference, final Collection<ReferenceOrderedDataSource> rods) {
this.shard = shard;
this.genomeLocParser = genomeLocParser;
this.reference = reference;

View File

@ -25,20 +25,7 @@
package org.broadinstitute.gatk.engine.datasources.reads;
import htsjdk.samtools.*;
import htsjdk.samtools.seekablestream.SeekableBufferedStream;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import htsjdk.samtools.Bin;
/**
* A basic interface for querying BAM indices.
@ -47,116 +34,24 @@ import java.util.List;
* @author mhanna
* @version 0.1
*/
public class GATKBAMIndex {
public abstract class GATKBAMIndex {
/**
* BAM index file magic number.
* What is the starting bin for each level?
*/
private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes();
protected static final int[] LEVEL_STARTS = {0,1,9,73,585,4681};
/**
* Reports the total amount of genomic data that any bin can index.
*/
protected static final int BIN_GENOMIC_SPAN = 512*1024*1024;
/**
* What is the starting bin for each level?
*/
private static final int[] LEVEL_STARTS = {0,1,9,73,585,4681};
/**
* Reports the maximum number of bins that can appear in a BAM file.
*/
public static final int MAX_BINS = 37450; // =(8^6-1)/7+1
private final SAMSequenceDictionary sequenceDictionary;
private final File mFile;
//TODO: figure out a good value for this buffer size
private static final int BUFFERED_STREAM_BUFFER_SIZE = 8192;
/**
* Number of sequences stored in this index.
*/
private final int sequenceCount;
/**
* A cache of the starting positions of the sequences.
*/
private final long[] sequenceStartCache;
private SeekableFileStream fileStream;
private SeekableStream baiStream;
private SeekableBufferedStream bufferedStream;
private long fileLength;
public GATKBAMIndex(final File file, final SAMSequenceDictionary sequenceDictionary) {
mFile = file;
this.sequenceDictionary = sequenceDictionary;
// Open the file stream.
openIndexFile();
// Verify the magic number.
seek(0);
final byte[] buffer = readBytes(4);
if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) {
throw new ReviewedGATKException("Invalid file header in BAM index " + mFile +
": " + new String(buffer));
}
seek(4);
sequenceCount = readInteger();
// Create a cache of the starting position of each sequence. Initialize it to -1.
sequenceStartCache = new long[sequenceCount];
for(int i = 1; i < sequenceCount; i++)
sequenceStartCache[i] = -1;
// Seed the first element in the array with the current position.
if(sequenceCount > 0)
sequenceStartCache[0] = position();
closeIndexFile();
}
public GATKBAMIndexData readReferenceSequence(final int referenceSequence) {
openIndexFile();
if (referenceSequence >= sequenceCount)
throw new ReviewedGATKException("Invalid sequence number " + referenceSequence + " in index file " + mFile);
skipToSequence(referenceSequence);
int binCount = readInteger();
List<GATKBin> bins = new ArrayList<>();
for (int binNumber = 0; binNumber < binCount; binNumber++) {
final int indexBin = readInteger();
final int nChunks = readInteger();
List<GATKChunk> chunks = new ArrayList<>(nChunks);
long[] rawChunkData = readLongs(nChunks*2);
for (int ci = 0; ci < nChunks; ci++) {
final long chunkBegin = rawChunkData[ci*2];
final long chunkEnd = rawChunkData[ci*2+1];
chunks.add(new GATKChunk(chunkBegin, chunkEnd));
}
GATKBin bin = new GATKBin(referenceSequence, indexBin);
bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()]));
while(indexBin >= bins.size())
bins.add(null);
bins.set(indexBin,bin);
}
final int nLinearBins = readInteger();
long[] linearIndexEntries = readLongs(nLinearBins);
LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries);
closeIndexFile();
return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex);
}
public abstract GATKBAMIndexData readReferenceSequence(final int referenceSequence);
/**
* Get the number of levels employed by this index.
@ -180,91 +75,35 @@ public class GATKBAMIndex {
* @param levelNumber Level number. 0-based.
* @return The size (number of possible bins) of the given level.
*/
public int getLevelSize(final int levelNumber) {
if(levelNumber == getNumIndexLevels()-1)
return MAX_BINS-LEVEL_STARTS[levelNumber]-1;
else
return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber];
}
public abstract int getLevelSize(final int levelNumber);
/**
* Gets the level associated with the given bin number.
* @param bin The bin for which to determine the level.
* @return the level associated with the given bin number.
*/
public int getLevelForBin(final Bin bin) {
GATKBin gatkBin = new GATKBin(bin);
if(gatkBin.getBinNumber() >= MAX_BINS)
throw new ReviewedGATKException("Tried to get level for invalid bin in index file " + mFile);
for(int i = getNumIndexLevels()-1; i >= 0; i--) {
if(gatkBin.getBinNumber() >= LEVEL_STARTS[i])
return i;
}
throw new ReviewedGATKException("Unable to find correct bin for bin " + bin + " in index file " + mFile);
}
public abstract int getLevelForBin(final Bin bin);
/**
* Gets the first locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
public int getFirstLocusInBin(final Bin bin) {
final int level = getLevelForBin(bin);
final int levelStart = LEVEL_STARTS[level];
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
return (new GATKBin(bin).getBinNumber() - levelStart)*(BIN_GENOMIC_SPAN /levelSize)+1;
}
public abstract int getFirstLocusInBin(final Bin bin);
/**
* Gets the last locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
public int getLastLocusInBin(final Bin bin) {
final int level = getLevelForBin(bin);
final int levelStart = LEVEL_STARTS[level];
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
}
public abstract int getLastLocusInBin(final Bin bin);
/**
* Use to get close to the unmapped reads at the end of a BAM file.
* @return The file offset of the first record in the last linear bin, or -1
* if there are no elements in linear bins (i.e. no mapped reads).
*/
public long getStartOfLastLinearBin() {
openIndexFile();
seek(4);
final int sequenceCount = readInteger();
// Because no reads may align to the last sequence in the sequence dictionary,
// grab the last element of the linear index for each sequence, and return
// the last one from the last sequence that has one.
long lastLinearIndexPointer = -1;
for (int i = 0; i < sequenceCount; i++) {
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
// System.out.println("# nBins: " + nBins);
for (int j1 = 0; j1 < nBins; j1++) {
// Skip bin #
skipBytes(4);
final int nChunks = readInteger();
// Skip chunks
skipBytes(16 * nChunks);
}
final int nLinearBins = readInteger();
if (nLinearBins > 0) {
// Skip to last element of list of linear bins
skipBytes(8 * (nLinearBins - 1));
lastLinearIndexPointer = readLongs(1)[0];
}
}
closeIndexFile();
return lastLinearIndexPointer;
}
public abstract long getStartOfLastLinearBin();
/**
* Gets the possible number of bins for a given reference sequence.
@ -273,197 +112,4 @@ public class GATKBAMIndex {
protected int getMaxAddressibleGenomicLocation() {
return BIN_GENOMIC_SPAN;
}
protected void skipToSequence(final int referenceSequence) {
// Find the offset in the file of the last sequence whose position has been determined. Start here
// when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always
// be present, so no extra stopping condition is necessary.
int sequenceIndex = referenceSequence;
while(sequenceStartCache[sequenceIndex] == -1)
sequenceIndex--;
// Advance to the most recently found position.
seek(sequenceStartCache[sequenceIndex]);
for (int i = sequenceIndex; i < referenceSequence; i++) {
sequenceStartCache[i] = position();
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
// System.out.println("# nBins: " + nBins);
for (int j = 0; j < nBins; j++) {
/* final int bin = */
readInteger();
final int nChunks = readInteger();
// System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks);
skipBytes(16 * nChunks);
}
final int nLinearBins = readInteger();
// System.out.println("# nLinearBins: " + nLinearBins);
skipBytes(8 * nLinearBins);
}
sequenceStartCache[referenceSequence] = position();
}
private void openIndexFile() {
try {
fileStream = new SeekableFileStream(mFile);
baiStream = SamIndexes.asBaiSeekableStreamOrNull(fileStream, sequenceDictionary);
bufferedStream = new SeekableBufferedStream(baiStream, BUFFERED_STREAM_BUFFER_SIZE);
fileLength=bufferedStream.length();
}
catch (IOException exc) {
throw new ReviewedGATKException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc);
}
}
private void closeIndexFile() {
try {
bufferedStream.close();
baiStream.close();
fileStream.close();
fileLength = -1;
}
catch (IOException exc) {
throw new ReviewedGATKException("Unable to close index file " + mFile, exc);
}
}
private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8;
private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;
private byte[] readBytes(int count) {
ByteBuffer buffer = getBuffer(count);
read(buffer);
buffer.flip();
byte[] contents = new byte[count];
buffer.get(contents);
return contents;
}
private int readInteger() {
ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES);
read(buffer);
buffer.flip();
return buffer.getInt();
}
/**
* Reads an array of <count> longs from the file channel, returning the results as an array.
* @param count Number of longs to read.
* @return An array of longs. Size of array should match count.
*/
private long[] readLongs(final int count) {
ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES);
read(buffer);
buffer.flip();
long[] result = new long[count];
for(int i = 0; i < count; i++)
result[i] = buffer.getLong();
return result;
}
private void read(final ByteBuffer buffer) {
final int bytesRequested = buffer.limit();
if (bytesRequested == 0)
return;
try {
//BufferedInputStream cannot read directly into a byte buffer, so we read into an array
//and put the result into the bytebuffer after the if statement.
// We have a rigid expectation here to read in exactly the number of bytes we've limited
// our buffer to -- if there isn't enough data in the file, the index
// must be truncated or otherwise corrupt:
if(bytesRequested > fileLength - bufferedStream.position()){
throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " +
"It's likely that this file is truncated or corrupt -- " +
"Please try re-indexing the corresponding BAM file.",
mFile));
}
int bytesRead = bufferedStream.read(byteArray, 0, bytesRequested);
// We have a rigid expectation here to read in exactly the number of bytes we've limited
// our buffer to -- if we encounter EOF (-1), the index
// must be truncated or otherwise corrupt:
if (bytesRead <= 0) {
throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " +
"It's likely that this file is truncated or corrupt -- " +
"Please try re-indexing the corresponding BAM file.",
mFile));
}
if(bytesRead != bytesRequested)
throw new RuntimeException("Read amount different from requested amount. This should not happen.");
buffer.put(byteArray, 0, bytesRequested);
}
catch(IOException ex) {
throw new ReviewedGATKException("Index: unable to read bytes from index file " + mFile);
}
}
/**
* A reusable buffer for use by this index generator.
* TODO: Should this be a SoftReference?
*/
private ByteBuffer buffer = null;
//BufferedStream don't read into ByteBuffers, so we need this temporary array
private byte[] byteArray=null;
private ByteBuffer getBuffer(final int size) {
if(buffer == null || buffer.capacity() < size) {
// Allocate a new byte buffer. For now, make it indirect to make sure it winds up on the heap for easier debugging.
buffer = ByteBuffer.allocate(size);
byteArray = new byte[size];
buffer.order(ByteOrder.LITTLE_ENDIAN);
}
buffer.clear();
buffer.limit(size);
return buffer;
}
private void skipBytes(final int count) {
try {
//try to skip forward the requested amount.
long skipped = bufferedStream.skip(count);
if( skipped != count ) { //if not managed to skip the requested amount
throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile);
}
}
catch(IOException ex) {
throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile);
}
}
private void seek(final long position) {
try {
//to seek a new position, move the fileChannel, and reposition the bufferedStream
bufferedStream.seek(position);
}
catch(IOException ex) {
throw new ReviewedGATKException("Index: unable to reposition of file channel of index file " + mFile);
}
}
/**
* Retrieve the position from the current file channel.
* @return position of the current file channel.
*/
private long position() {
try {
return bufferedStream.position();
}
catch (IOException exc) {
throw new ReviewedGATKException("Unable to read position from index file " + mFile, exc);
}
}
}

View File

@ -0,0 +1,109 @@
/*
* Copyright 2012-2016 Broad Institute, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.engine.datasources.reads;
import htsjdk.samtools.*;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* This class implements BAM index querying API
* by wrapping a class BrowseableBAMIndex from HTSJDK.
*
* @version 0.1
*/
public class GATKBAMIndexFromDataSource extends GATKBAMIndex {
private File sourceFile;
private SAMFileHeader sourceHeader;
private BrowseableBAMIndex index;
public GATKBAMIndexFromDataSource(final File sourceFile, final SAMFileHeader sourceHeader, final BrowseableBAMIndex index) {
this.sourceFile = sourceFile;
this.sourceHeader = sourceHeader;
this.index = index;
}
@Override
public GATKBAMIndexData readReferenceSequence(final int referenceSequence) {
final List<SAMSequenceRecord> sequences = sourceHeader.getSequenceDictionary().getSequences();
if (referenceSequence >= sequences.size())
throw new ReviewedGATKException("Sequence number " + referenceSequence + " cannot be greater or equal to " + sequences.size() + " in index file " + sourceFile);
final BinList sourceBins = index.getBinsOverlapping(referenceSequence, 0, sequences.get(referenceSequence).getSequenceLength());
final List<GATKBin> bins = new ArrayList<>();
for (Bin sourceBin : sourceBins) {
final int indexBin = sourceBin.getBinNumber();
while(indexBin >= bins.size())
bins.add(null);
final GATKBin bin = new GATKBin(referenceSequence, indexBin);
final List<Chunk> chunks = index.getSpanOverlapping(sourceBin).getChunks();
final List<GATKChunk> gatkChunks = new ArrayList<>(chunks.size());
for (Chunk chunk : chunks) {
gatkChunks.add(new GATKChunk(chunk));
}
bin.setChunkList(gatkChunks.toArray(new GATKChunk[gatkChunks.size()]));
bins.set(indexBin, bin);
}
// there is no interface to get linear index from HTSJDK
final LinearIndex linearIndex = new LinearIndex(referenceSequence, 0, new long[]{});
return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex);
}
@Override
public int getLevelSize(int levelNumber) {
return index.getLevelSize(levelNumber);
}
@Override
public int getLevelForBin(Bin bin) {
return index.getLevelForBin(bin);
}
@Override
public int getFirstLocusInBin(Bin bin) {
return index.getFirstLocusInBin(bin);
}
@Override
public int getLastLocusInBin(Bin bin) {
return index.getLastLocusInBin(bin);
}
@Override
public long getStartOfLastLinearBin() {
return index.getStartOfLastLinearBin();
}
}

View File

@ -0,0 +1,435 @@
/*
* Copyright 2012-2016 Broad Institute, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.engine.datasources.reads;
import htsjdk.samtools.*;
import htsjdk.samtools.seekablestream.SeekableBufferedStream;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* This class implements BAM index querying API
* by reading .bai file directly.
* Very much not thread-safe.
*
* @author mhanna
* @version 0.1
*/
public class GATKBAMIndexFromFile extends GATKBAMIndex {
/**
* BAM index file magic number.
*/
private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes();
private final SAMSequenceDictionary sequenceDictionary;
private final File mFile;
//TODO: figure out a good value for this buffer size
private static final int BUFFERED_STREAM_BUFFER_SIZE = 8192;
/**
* Number of sequences stored in this index.
*/
private final int sequenceCount;
/**
* A cache of the starting positions of the sequences.
*/
private final long[] sequenceStartCache;
private SeekableFileStream fileStream;
private SeekableStream baiStream;
private SeekableBufferedStream bufferedStream;
private long fileLength;
public GATKBAMIndexFromFile(final File file, final SAMSequenceDictionary sequenceDictionary) {
mFile = file;
this.sequenceDictionary = sequenceDictionary;
// Open the file stream.
openIndexFile();
// Verify the magic number.
seek(0);
final byte[] buffer = readBytes(4);
if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) {
throw new ReviewedGATKException("Invalid file header in BAM index " + mFile +
": " + new String(buffer));
}
seek(4);
sequenceCount = readInteger();
// Create a cache of the starting position of each sequence. Initialize it to -1.
sequenceStartCache = new long[sequenceCount];
for(int i = 1; i < sequenceCount; i++)
sequenceStartCache[i] = -1;
// Seed the first element in the array with the current position.
if(sequenceCount > 0)
sequenceStartCache[0] = position();
closeIndexFile();
}
public GATKBAMIndexData readReferenceSequence(final int referenceSequence) {
openIndexFile();
if (referenceSequence >= sequenceCount)
throw new ReviewedGATKException("Invalid sequence number " + referenceSequence + " in index file " + mFile);
skipToSequence(referenceSequence);
final int binCount = readInteger();
List<GATKBin> bins = new ArrayList<>();
for (int binNumber = 0; binNumber < binCount; binNumber++) {
final int indexBin = readInteger();
final int nChunks = readInteger();
final List<GATKChunk> chunks = new ArrayList<>(nChunks);
final long[] rawChunkData = readLongs(nChunks*2);
for (int ci = 0; ci < nChunks; ci++) {
final long chunkBegin = rawChunkData[ci*2];
final long chunkEnd = rawChunkData[ci*2+1];
chunks.add(new GATKChunk(chunkBegin, chunkEnd));
}
final GATKBin bin = new GATKBin(referenceSequence, indexBin);
bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()]));
while(indexBin >= bins.size())
bins.add(null);
bins.set(indexBin,bin);
}
final int nLinearBins = readInteger();
long[] linearIndexEntries = readLongs(nLinearBins);
final LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries);
closeIndexFile();
return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex);
}
/**
* Gets the number of bins in the given level.
* @param levelNumber Level number. 0-based.
* @return The size (number of possible bins) of the given level.
*/
@Override
public int getLevelSize(final int levelNumber) {
if(levelNumber == getNumIndexLevels()-1)
return MAX_BINS-LEVEL_STARTS[levelNumber]-1;
else
return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber];
}
/**
* Gets the level associated with the given bin number.
* @param bin The bin for which to determine the level.
* @return the level associated with the given bin number.
*/
@Override
public int getLevelForBin(final Bin bin) {
final GATKBin gatkBin = new GATKBin(bin);
if(gatkBin.getBinNumber() >= MAX_BINS)
throw new ReviewedGATKException("Tried to get level for invalid bin in index file " + mFile);
for(int i = getNumIndexLevels()-1; i >= 0; i--) {
if(gatkBin.getBinNumber() >= LEVEL_STARTS[i])
return i;
}
throw new ReviewedGATKException("Unable to find correct bin for bin " + bin + " in index file " + mFile);
}
/**
* Gets the first locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
@Override
public int getFirstLocusInBin(final Bin bin) {
final int level = getLevelForBin(bin);
final int levelStart = LEVEL_STARTS[level];
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
return (new GATKBin(bin).getBinNumber() - levelStart)*(BIN_GENOMIC_SPAN /levelSize)+1;
}
/**
* Gets the last locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
@Override
public int getLastLocusInBin(final Bin bin) {
final int level = getLevelForBin(bin);
final int levelStart = LEVEL_STARTS[level];
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
}
/**
* Use to get close to the unmapped reads at the end of a BAM file.
* @return The file offset of the first record in the last linear bin, or -1
* if there are no elements in linear bins (i.e. no mapped reads).
*/
@Override
public long getStartOfLastLinearBin() {
openIndexFile();
seek(4);
final int sequenceCount = readInteger();
// Because no reads may align to the last sequence in the sequence dictionary,
// grab the last element of the linear index for each sequence, and return
// the last one from the last sequence that has one.
long lastLinearIndexPointer = -1;
for (int i = 0; i < sequenceCount; i++) {
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
// System.out.println("# nBins: " + nBins);
for (int j1 = 0; j1 < nBins; j1++) {
// Skip bin #
skipBytes(4);
final int nChunks = readInteger();
// Skip chunks
skipBytes(16 * nChunks);
}
final int nLinearBins = readInteger();
if (nLinearBins > 0) {
// Skip to last element of list of linear bins
skipBytes(8 * (nLinearBins - 1));
lastLinearIndexPointer = readLongs(1)[0];
}
}
closeIndexFile();
return lastLinearIndexPointer;
}
protected void skipToSequence(final int referenceSequence) {
// Find the offset in the file of the last sequence whose position has been determined. Start here
// when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always
// be present, so no extra stopping condition is necessary.
int sequenceIndex = referenceSequence;
while(sequenceStartCache[sequenceIndex] == -1)
sequenceIndex--;
// Advance to the most recently found position.
seek(sequenceStartCache[sequenceIndex]);
for (int i = sequenceIndex; i < referenceSequence; i++) {
sequenceStartCache[i] = position();
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
// System.out.println("# nBins: " + nBins);
for (int j = 0; j < nBins; j++) {
/* final int bin = */
readInteger();
final int nChunks = readInteger();
// System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks);
skipBytes(16 * nChunks);
}
final int nLinearBins = readInteger();
// System.out.println("# nLinearBins: " + nLinearBins);
skipBytes(8 * nLinearBins);
}
sequenceStartCache[referenceSequence] = position();
}
private void openIndexFile() {
try {
fileStream = new SeekableFileStream(mFile);
baiStream = SamIndexes.asBaiSeekableStreamOrNull(fileStream, sequenceDictionary);
bufferedStream = new SeekableBufferedStream(baiStream, BUFFERED_STREAM_BUFFER_SIZE);
fileLength=bufferedStream.length();
}
catch (IOException exc) {
throw new ReviewedGATKException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc);
}
}
private void closeIndexFile() {
try {
bufferedStream.close();
baiStream.close();
fileStream.close();
fileLength = -1;
}
catch (IOException exc) {
throw new ReviewedGATKException("Unable to close index file " + mFile, exc);
}
}
private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8;
private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;
private byte[] readBytes(int count) {
final ByteBuffer buffer = getBuffer(count);
read(buffer);
buffer.flip();
byte[] contents = new byte[count];
buffer.get(contents);
return contents;
}
private int readInteger() {
final ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES);
read(buffer);
buffer.flip();
return buffer.getInt();
}
/**
* Reads an array of <count> longs from the file channel, returning the results as an array.
* @param count Number of longs to read.
* @return An array of longs. Size of array should match count.
*/
private long[] readLongs(final int count) {
final ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES);
read(buffer);
buffer.flip();
long[] result = new long[count];
for(int i = 0; i < count; i++)
result[i] = buffer.getLong();
return result;
}
private void read(final ByteBuffer buffer) {
final int bytesRequested = buffer.limit();
if (bytesRequested == 0)
return;
try {
//BufferedInputStream cannot read directly into a byte buffer, so we read into an array
//and put the result into the bytebuffer after the if statement.
// We have a rigid expectation here to read in exactly the number of bytes we've limited
// our buffer to -- if there isn't enough data in the file, the index
// must be truncated or otherwise corrupt:
if(bytesRequested > fileLength - bufferedStream.position()){
throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " +
"It's likely that this file is truncated or corrupt -- " +
"Please try re-indexing the corresponding BAM file.",
mFile));
}
int bytesRead = bufferedStream.read(byteArray, 0, bytesRequested);
// We have a rigid expectation here to read in exactly the number of bytes we've limited
// our buffer to -- if we encounter EOF (-1), the index
// must be truncated or otherwise corrupt:
if (bytesRead <= 0) {
throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " +
"It's likely that this file is truncated or corrupt -- " +
"Please try re-indexing the corresponding BAM file.",
mFile));
}
if(bytesRead != bytesRequested)
throw new RuntimeException("Read amount different from requested amount. This should not happen.");
buffer.put(byteArray, 0, bytesRequested);
}
catch(IOException ex) {
throw new ReviewedGATKException("Index: unable to read bytes from index file " + mFile);
}
}
/**
* A reusable buffer for use by this index generator.
* TODO: Should this be a SoftReference?
*/
private ByteBuffer buffer = null;
//BufferedStream don't read into ByteBuffers, so we need this temporary array
private byte[] byteArray=null;
private ByteBuffer getBuffer(final int size) {
if(buffer == null || buffer.capacity() < size) {
// Allocate a new byte buffer. For now, make it indirect to make sure it winds up on the heap for easier debugging.
buffer = ByteBuffer.allocate(size);
byteArray = new byte[size];
buffer.order(ByteOrder.LITTLE_ENDIAN);
}
buffer.clear();
buffer.limit(size);
return buffer;
}
private void skipBytes(final int count) {
try {
//try to skip forward the requested amount.
final long skipped = bufferedStream.skip(count);
if( skipped != count ) { //if not managed to skip the requested amount
throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile);
}
}
catch(IOException ex) {
throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile);
}
}
private void seek(final long position) {
try {
//to seek a new position, move the fileChannel, and reposition the bufferedStream
bufferedStream.seek(position);
}
catch(IOException ex) {
throw new ReviewedGATKException("Index: unable to reposition of file channel of index file " + mFile);
}
}
/**
* Retrieve the position from the current file channel.
* @return position of the current file channel.
*/
private long position() {
try {
return bufferedStream.position();
}
catch (IOException exc) {
throw new ReviewedGATKException("Unable to read position from index file " + mFile, exc);
}
}
}

View File

@ -28,7 +28,10 @@ package org.broadinstitute.gatk.engine.datasources.reads;
import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.sra.SRAAccession;
import htsjdk.samtools.sra.SRAIndexedSequenceFile;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.RuntimeIOException;
@ -51,7 +54,6 @@ import org.broadinstitute.gatk.utils.interval.IntervalMergingRule;
import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator;
import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter;
import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecordIterator;
import org.broadinstitute.gatk.utils.sam.SAMReaderID;
@ -310,7 +312,7 @@ public class SAMDataSource {
// Determine the sort order.
for(SAMReaderID readerID: readerIDs) {
if (! readerID.getSamFile().canRead() )
if (!SRAAccession.isValid(readerID.getSamFile().getPath()) && !readerID.getSamFile().canRead() )
throw new UserException.CouldNotReadInputFile(readerID.getSamFile(),"file is not present or user does not have appropriate permissions. " +
"Please check that the file is present and readable and try again.");
@ -377,15 +379,28 @@ public class SAMDataSource {
if (referenceFile == null) {
samSequenceDictionary = mergedHeader.getSequenceDictionary();
} else {
samSequenceDictionary = ReferenceSequenceFileFactory.
getReferenceSequenceFile(referenceFile).
getSequenceDictionary();
ReferenceSequenceFile ref;
// maybe it is SRA file?
if (SRAAccession.isValid(referenceFile.getPath())) {
ref = new SRAIndexedSequenceFile(new SRAAccession(referenceFile.getPath()));
} else {
ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile);
}
samSequenceDictionary = ref.getSequenceDictionary();
}
for(SAMReaderID id: readerIDs) {
File indexFile = findIndexFile(id.getSamFile());
if(indexFile != null)
bamIndices.put(id,new GATKBAMIndex(indexFile, samSequenceDictionary));
if(indexFile != null) {
bamIndices.put(id, new GATKBAMIndexFromFile(indexFile, samSequenceDictionary));
continue;
}
// if index file is not found, try SamReader indexing interface
SamReader reader = readers.getReader(id);
if (reader.indexing().hasBrowseableIndex()) {
bamIndices.put(id, new GATKBAMIndexFromDataSource(id.getSamFile(), reader.getFileHeader(), reader.indexing().getBrowseableIndex()));
}
}
resourcePool.releaseReaders(readers);
@ -1122,12 +1137,17 @@ public class SAMDataSource {
try {
if (threadAllocation.getNumIOThreads() > 0)
blockInputStream = new BlockInputStream(dispatcher,readerID,false);
reader = SamReaderFactory.makeDefault()
.referenceSequence(referenceFile)
SamReaderFactory factory = SamReaderFactory.makeDefault()
.validationStringency(validationStringency)
.setOption(SamReaderFactory.Option.EAGERLY_DECODE, false)
.setOption(SamReaderFactory.Option.EAGERLY_DECODE, false);
if (SRAAccession.isValid(readerID.getSamFile().getPath())) {
reader = factory.open(SamInputResource.of(new SRAAccession(readerID.getSamFile().getPath())));
} else {
reader = factory.referenceSequence(referenceFile)
.setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, true)
.open(readerID.getSamFile());
}
} catch ( RuntimeIOException e ) {
throw new UserException.CouldNotReadInputFile(readerID.getSamFile(), e);

View File

@ -26,6 +26,7 @@
package org.broadinstitute.gatk.engine.datasources.reads.utilities;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.commandline.CommandLineProgram;
import org.broadinstitute.gatk.utils.commandline.Input;
@ -88,12 +89,12 @@ public class FindLargeShards extends CommandLineProgram {
@Override
public int execute() throws IOException {
// initialize reference
IndexedFastaSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile);
GenomeLocParser genomeLocParser = new GenomeLocParser(refReader);
final ReferenceSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile);
final GenomeLocParser genomeLocParser = new GenomeLocParser(refReader);
// initialize reads
List<SAMReaderID> bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser);
SAMDataSource dataSource = new SAMDataSource(referenceFile, bamReaders, new ThreadAllocation(), null, genomeLocParser);
final SAMDataSource dataSource = new SAMDataSource(referenceFile, bamReaders, new ThreadAllocation(), null, genomeLocParser);
// intervals
final GenomeLocSortedSet intervalSortedSet;
@ -124,12 +125,12 @@ public class FindLargeShards extends CommandLineProgram {
}
// Print out the stddev: (sum(x^2) - (1/N)*sum(x)^2)/N
long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue();
long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue()));
final long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue();
final long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue()));
logger.info(String.format("Number of shards: %d; mean uncompressed size = %d; stddev uncompressed size = %d%n",numberOfShards,mean,stddev));
// Crank through the shards again, this time reporting on the shards significantly larger than the mean.
long threshold = mean + stddev*5;
final long threshold = mean + stddev*5;
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");

View File

@ -25,8 +25,8 @@
package org.broadinstitute.gatk.engine.datasources.reference;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.engine.datasources.reads.LocusShard;
import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource;
import org.broadinstitute.gatk.engine.datasources.reads.Shard;
@ -46,7 +46,7 @@ import java.util.List;
* Looks for fai and dict files, and tries to create them if they don't exist
*/
public class ReferenceDataSource {
private IndexedFastaSequenceFile reference;
private ReferenceSequenceFile reference;
/** our log, which we want to capture anything from this class */
protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
@ -55,7 +55,7 @@ public class ReferenceDataSource {
* Create reference data source from fasta file
* @param fastaFile Fasta file to be used as reference
*/
public ReferenceDataSource(File fastaFile) {
public ReferenceDataSource(final File fastaFile) {
reference = CachingIndexedFastaSequenceFile.checkAndCreate(fastaFile);
}
@ -63,7 +63,7 @@ public class ReferenceDataSource {
* Get indexed fasta file
* @return IndexedFastaSequenceFile that was created from file
*/
public IndexedFastaSequenceFile getReference() {
public ReferenceSequenceFile getReference() {
return this.reference;
}

View File

@ -25,7 +25,7 @@
package org.broadinstitute.gatk.engine.executive;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.TribbleException;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource;
@ -111,7 +111,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine,
final Walker walker,
final SAMDataSource reads,
final IndexedFastaSequenceFile reference,
final ReferenceSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods,
final ThreadAllocation threadAllocation) {
super(engine, walker, reads, reference, rods, threadAllocation);

View File

@ -25,7 +25,7 @@
package org.broadinstitute.gatk.engine.executive;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider;
import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider;
@ -63,7 +63,7 @@ public class LinearMicroScheduler extends MicroScheduler {
protected LinearMicroScheduler(final GenomeAnalysisEngine engine,
final Walker walker,
final SAMDataSource reads,
final IndexedFastaSequenceFile reference,
final ReferenceSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods,
final ThreadAllocation threadAllocation) {
super(engine, walker, reads, reference, rods, threadAllocation);
@ -84,7 +84,6 @@ public class LinearMicroScheduler extends MicroScheduler {
Accumulator accumulator = Accumulator.create(engine,walker);
boolean done = walker.isDone();
int counter = 0;
final TraversalEngine traversalEngine = borrowTraversalEngine(this);
for (Shard shard : shardStrategy ) {

View File

@ -26,7 +26,7 @@
package org.broadinstitute.gatk.engine.executive;
import com.google.java.contract.Ensures;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.ReadMetrics;
@ -104,7 +104,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
*/
protected final GenomeAnalysisEngine engine;
protected final IndexedFastaSequenceFile reference;
protected final ReferenceSequenceFile reference;
private final SAMDataSource reads;
protected final Collection<ReferenceOrderedDataSource> rods;
@ -131,7 +131,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
*
* @return The best-fit microscheduler.
*/
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, ThreadAllocation threadAllocation) {
public static MicroScheduler create(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, final ReferenceSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods, final ThreadAllocation threadAllocation) {
if ( threadAllocation.isRunningInParallelMode() ) {
logger.info(String.format("Running the GATK in parallel mode with %d total threads, " +
"%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine",
@ -183,7 +184,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
protected MicroScheduler(final GenomeAnalysisEngine engine,
final Walker walker,
final SAMDataSource reads,
final IndexedFastaSequenceFile reference,
final ReferenceSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods,
final ThreadAllocation threadAllocation) {
this.engine = engine;
@ -397,7 +398,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
* Returns the reference maintained by this scheduler.
* @return The reference maintained by this scheduler.
*/
public IndexedFastaSequenceFile getReference() { return reference; }
public ReferenceSequenceFile getReference() { return reference; }
protected void cleanup() {
try {

View File

@ -25,7 +25,7 @@
package org.broadinstitute.gatk.engine.filters;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.WalkerManager;
import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
@ -41,7 +41,7 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
*/
public class BAQReadTransformer extends ReadTransformer {
private BAQ baqHMM;
private IndexedFastaSequenceFile refReader;
private ReferenceSequenceFile refReader;
private BAQ.CalculationMode cmode;
private BAQ.QualityMode qmode;

View File

@ -27,6 +27,7 @@ package org.broadinstitute.gatk.engine.walkers;
import com.google.java.contract.Ensures;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
@ -181,7 +182,7 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
// Map over the ActiveRegion
public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker);
public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) {
public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, final ReferenceSequenceFile reference ) {
final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionTraversalParameters.class).extension();
final List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
for( final GenomeLoc interval : intervals.toList() ) {

View File

@ -25,8 +25,8 @@
package org.broadinstitute.gatk.engine;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.commandline.Tags;
import org.broadinstitute.gatk.utils.ValidationExclusion;
@ -82,7 +82,7 @@ public class ReadMetricsUnitTest extends BaseTest {
// Test the accuracy of the read metrics
private File referenceFile;
private IndexedFastaSequenceFile reference;
private ReferenceSequenceFile reference;
private SAMSequenceDictionary dictionary;
private SAMFileHeader header;
private GATKSAMReadGroupRecord readGroup;

View File

@ -25,6 +25,7 @@
package org.broadinstitute.gatk.engine.datasources.providers;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.commandline.RodBinding;
import org.broadinstitute.gatk.utils.commandline.Tags;
@ -71,7 +72,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
/**
* Sequence file.
*/
private static IndexedFastaSequenceFile seq;
private static ReferenceSequenceFile seq;
private GenomeLocParser genomeLocParser;
/**

View File

@ -26,7 +26,7 @@
package org.broadinstitute.gatk.engine.datasources.providers;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@ -57,7 +57,7 @@ public abstract class ReferenceViewTemplate extends BaseTest {
/**
* The fasta, for comparison.
*/
protected IndexedFastaSequenceFile sequenceFile = null;
protected ReferenceSequenceFile sequenceFile = null;
protected GenomeLocParser genomeLocParser = null;
//

View File

@ -25,9 +25,9 @@
package org.broadinstitute.gatk.engine.datasources.reads;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.GATKBAMFileSpan;
import htsjdk.samtools.GATKChunk;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.commandline.Tags;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@ -45,7 +45,7 @@ import java.io.FileNotFoundException;
*
*/
public class FilePointerUnitTest extends BaseTest {
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
private GenomeLocParser genomeLocParser;
private SAMReaderID readerID = new SAMReaderID("samFile",new Tags());

View File

@ -0,0 +1,98 @@
/*
* Copyright 2012-2016 Broad Institute, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.engine.datasources.reads;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
/**
* Test basic functionality in the GATK's wrapper around htsjdk.samtools.BrowseableBAMIndex.
*/
public class GATKBAMIndexFromDataSourceUnitTest extends BaseTest {
private static File bamFile = new File(validationDataLocation+"MV1994.selected.bam");
/**
* Storage for the index itself.
*/
private GATKBAMIndex bamIndex;
@BeforeClass
public void init() throws IOException {
final SAMFileReader reader = new SAMFileReader(bamFile);
reader.enableIndexCaching(true); // needed ot get BrowseableBAMIndex
Assert.assertTrue(reader.hasIndex());
Assert.assertTrue(reader.indexing().hasBrowseableIndex());
bamIndex = new GATKBAMIndexFromDataSource(bamFile, reader.getFileHeader(), reader.indexing().getBrowseableIndex());
reader.close();
}
@Test
public void testNumberAndSizeOfIndexLevels() {
// The correct values for this test are pulled directly from the
// SAM Format Specification v1.3-r882, Section 4.1.1, last paragraph.
Assert.assertEquals(GATKBAMIndex.getNumIndexLevels(),6,"Incorrect number of levels in BAM index");
// Level 0
Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(0),0);
Assert.assertEquals(bamIndex.getLevelSize(0),1);
// Level 1
Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(1),1);
Assert.assertEquals(bamIndex.getLevelSize(1),8-1+1);
// Level 2
Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(2),9);
Assert.assertEquals(bamIndex.getLevelSize(2),72-9+1);
// Level 3
Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(3),73);
Assert.assertEquals(bamIndex.getLevelSize(3),584-73+1);
// Level 4
Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(4),585);
Assert.assertEquals(bamIndex.getLevelSize(4),4680-585+1);
// Level 5
Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(5),4681);
// Need to wait unitl AbstractBAMFileIndex.getLevelSize() is fixed, will throw an ArrayIndexOutOfBoundsException if 1 before the end of the GenomicIndexUtil.LEVEL_STARTS array
//Assert.assertEquals(bamIndex.getLevelSize(5),37448-4681+1);
}
@Test( expectedExceptions = ReviewedGATKException.class )
public void testTooManyReferenceSequencesDataSource() {
final GATKBAMIndexData index = bamIndex.readReferenceSequence(1);
}
}

View File

@ -39,7 +39,7 @@ import java.io.FileNotFoundException;
/**
* Test basic functionality in the GATK's implementation of the BAM index classes.
*/
public class GATKBAMIndexUnitTest extends BaseTest {
public class GATKBAMIndexFromFileUnitTest extends BaseTest {
private static File bamFile = new File(validationDataLocation+"MV1994.selected.bam");
/**
@ -60,11 +60,11 @@ public class GATKBAMIndexUnitTest extends BaseTest {
@BeforeClass
public void init() throws FileNotFoundException {
SAMFileReader reader = new SAMFileReader(bamFile);
this.sequenceDictionary = reader.getFileHeader().getSequenceDictionary();
final SAMFileReader reader = new SAMFileReader(bamFile);
sequenceDictionary = reader.getFileHeader().getSequenceDictionary();
reader.close();
bamIndex = new GATKBAMIndex(bamIndexFile, sequenceDictionary);
bamIndex = new GATKBAMIndexFromFile(bamIndexFile, sequenceDictionary);
}
@Test
@ -100,13 +100,13 @@ public class GATKBAMIndexUnitTest extends BaseTest {
@Test( expectedExceptions = UserException.MalformedFile.class )
public void testDetectTruncatedBamIndexWordBoundary() {
GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_word_boundary.bai"), sequenceDictionary);
final GATKBAMIndex index = new GATKBAMIndexFromFile(new File(privateTestDir + "truncated_at_word_boundary.bai"), sequenceDictionary);
index.readReferenceSequence(0);
}
@Test( expectedExceptions = UserException.MalformedFile.class )
public void testDetectTruncatedBamIndexNonWordBoundary() {
GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_non_word_boundary.bai"), sequenceDictionary);
public void testDetectTruncatedBamIndeFromFilexNonWordBoundary() {
final GATKBAMIndex index = new GATKBAMIndexFromFile(new File(privateTestDir + "truncated_at_non_word_boundary.bai"), sequenceDictionary);
index.readReferenceSequence(0);
}

View File

@ -25,8 +25,8 @@
package org.broadinstitute.gatk.engine.datasources.reads;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.commandline.Tags;
import org.broadinstitute.gatk.utils.ValidationExclusion;
@ -64,7 +64,7 @@ public class SAMDataSourceUnitTest extends BaseTest {
private List<SAMReaderID> readers;
private File referenceFile;
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
private GenomeLocParser genomeLocParser;
/**

View File

@ -25,6 +25,7 @@
package org.broadinstitute.gatk.engine.datasources.rmd;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.commandline.Tags;
import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder;
import org.testng.Assert;
@ -68,7 +69,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
private RMDTriplet triplet = null;
private RMDTrackBuilder builder = null;
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
private GenomeLocParser genomeLocParser;
private GenomeLoc testSite1;

View File

@ -25,7 +25,7 @@
package org.broadinstitute.gatk.engine.datasources.rmd;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.commandline.Tags;
@ -46,7 +46,7 @@ public class ReferenceOrderedQueryDataPoolUnitTest extends BaseTest{
// Build up query parameters
File file = new File(BaseTest.privateTestDir + "NA12878.hg19.example1.vcf");
RMDTriplet triplet = new RMDTriplet("test", "VCF", file.getAbsolutePath(), RMDTriplet.RMDStorageType.FILE, new Tags());
IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference));
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference));
GenomeLocParser parser = new GenomeLocParser(seq);
GenomeLoc loc = parser.createGenomeLoc("20", 1, 100000);
TestRMDTrackBuilder builder = new TestRMDTrackBuilder(seq.getSequenceDictionary(), parser);

View File

@ -25,7 +25,7 @@
package org.broadinstitute.gatk.engine.traversals;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.sam.ArtificialBAMBuilder;
@ -43,7 +43,7 @@ import java.util.List;
public class TAROrderedReadCacheUnitTest extends BaseTest {
// example fasta index file, can be deleted if you don't use the reference
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
@BeforeClass
public void setup() throws FileNotFoundException {

View File

@ -27,6 +27,7 @@ package org.broadinstitute.gatk.engine.traversals;
import com.google.java.contract.PreconditionError;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.commandline.Tags;
import org.broadinstitute.gatk.utils.ValidationExclusion;
import org.broadinstitute.gatk.engine.datasources.reads.*;
@ -39,7 +40,6 @@ import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.gatk.utils.interval.IntervalMergingRule;
import org.broadinstitute.gatk.utils.interval.IntervalUtils;
import org.broadinstitute.gatk.utils.sam.*;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider;
@ -80,7 +80,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
}
private File referenceFile;
private IndexedFastaSequenceFile reference;
private ReferenceSequenceFile reference;
private SAMSequenceDictionary dictionary;
private GenomeLocParser genomeLocParser;

View File

@ -25,7 +25,6 @@
package org.broadinstitute.gatk.engine.traversals;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.engine.walkers.TestCountReadsWalker;
import org.broadinstitute.gatk.utils.BaseTest;
@ -90,7 +89,7 @@ public class TraverseReadsUnitTest extends BaseTest {
private File output;
private TraverseReadsNano traversalEngine = null;
private IndexedFastaSequenceFile ref = null;
private ReferenceSequenceFile ref = null;
private GenomeLocParser genomeLocParser = null;
private GenomeAnalysisEngine engine = null;

View File

@ -418,6 +418,7 @@
<queuetest.run>${gatk.queuetests.run}</queuetest.run>
<java.io.tmpdir>${java.io.tmpdir}</java.io.tmpdir>
</systemPropertyVariables>
<groups>sra</groups>
</configuration>
<executions>
<execution>

View File

@ -25,8 +25,8 @@
package org.broadinstitute.gatk.tools.walkers.qc;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
@ -70,7 +70,7 @@ public class QCRef extends RefWalker<Integer, Integer> {
String contigName = "";
int contigStart, contigEnd;
IndexedFastaSequenceFile uncachedRef;
ReferenceSequenceFile uncachedRef;
byte[] uncachedBases;
@Override

View File

@ -27,6 +27,7 @@ package org.broadinstitute.gatk.tools.walkers.varianteval;
import com.google.java.contract.Requires;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.util.IntervalTree;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
@ -282,7 +283,7 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this);
// Ancestral alignments
private IndexedFastaSequenceFile ancestralAlignments = null;
private ReferenceSequenceFile ancestralAlignments = null;
// The set of all possible evaluation contexts
StratificationManager<VariantStratifier, EvaluationContext> stratManager;

View File

@ -25,7 +25,7 @@
package org.broadinstitute.gatk.tools.walkers.filters;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@ -51,7 +51,7 @@ public class VariantFiltrationUnitTest extends BaseTest {
@BeforeTest
public void before() {
// Create GenomeLoc
IndexedFastaSequenceFile fasta = CachingIndexedFastaSequenceFile.checkAndCreate(new File(privateTestDir + "iupacFASTA.fasta"));
ReferenceSequenceFile fasta = CachingIndexedFastaSequenceFile.checkAndCreate(new File(privateTestDir + "iupacFASTA.fasta"));
GenomeLocParser genomeLocParser = new GenomeLocParser(fasta);
chr1 = fasta.getSequenceDictionary().getSequence(0).getSequenceName();
genomeLoc = genomeLocParser.createGenomeLoc(chr1, 5, 10);

View File

@ -25,8 +25,8 @@
package org.broadinstitute.gatk.tools.walkers.indels;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
@ -44,7 +44,7 @@ public class IndelRealignerUnitTest extends BaseTest {
@BeforeClass
public void setup() throws FileNotFoundException {
final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
}

View File

@ -0,0 +1,143 @@
/*
* Copyright 2012-2016 Broad Institute, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.readutils;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.testng.SkipException;
import org.testng.annotations.BeforeGroups;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Collections;
import htsjdk.samtools.sra.SRAAccession;
import java.lang.reflect.Method;
public class SRAPrintReadsIntegrationTest extends WalkerTest {
private static class PRTest {
final String accession;
final String args;
final String md5;
private PRTest(String accession, String args, String md5) {
this.accession = accession;
this.args = args;
this.md5 = md5;
}
@Override
public String toString() {
return String.format("PRTest(accession='%s', args='%s')", accession, args);
}
}
private static boolean canResolveNetworkAccession = false;
private static String checkAccession = "SRR000123";
private static final String REMOTE_ACCESSION_PATTERN = "^[SED]RR[0-9]{6,9}$";
/**
* Are the SRA native libraries loaded and initialized? Does the test accession have a valid name?
*/
@BeforeGroups(groups = {"sra"})
public final void checkIfCanResolve() {
// Did SRA successfully load the native libraries and are fully initialized?
if (!SRAAccession.isSupported()) {
return;
}
// Is this is a valid SRA accession?
canResolveNetworkAccession = SRAAccession.isValid(checkAccession);
}
/**
* Are the SRA native libraries loaded and initialized?
*
* @throws SkipException if the SRA native libraries are loaded and initialized
*/
@BeforeMethod
public final void assertSRAIsSupported(){
if(!SRAAccession.isSupported()){
throw new SkipException("Skipping SRA Test because SRA native code is unavailable.");
}
}
/**
* Skip network SRA Test because cannot resolve remote SRA accession
*
* @param method Provides information about, and access to, a single method on a class or interface
* @param params Method parameters
* @throws SkipException if cannot resold an SRA accession
*/
@BeforeMethod
public void skipIfCantResolve(Method method, Object[] params) {
String accession = null;
if (params.length > 0) {
Object firstParam = params[0];
if (firstParam instanceof PRTest) {
accession = ((PRTest)firstParam).accession;
}
}
if (accession != null &&
accession.matches(REMOTE_ACCESSION_PATTERN) && !canResolveNetworkAccession) {
throw new SkipException("Skipping network SRA Test because cannot resolve remote SRA accession '" +
checkAccession + "'.");
}
}
@DataProvider(name = "PRTest")
public Object[][] createPrintReadsTestData() {
return new Object[][]{
// Test with local SRA accessions
{new PRTest(privateTestDir + "ERR1214757.sra", "", "173ed87acc794a704aa000c6ab5d63a8")},
{new PRTest(privateTestDir + "ERR1214757.sra", " -L NC_000001.10:1-50000", "6bc055f028c49bcbca990857e57a6e4b")},
{new PRTest(privateTestDir + "ERR1214757.sra", " -L NC_000001.10:500000-1000000", "ab545064b2314971cfae7486ff74d779")},
// Tests with remote SRA accessions
{new PRTest("ERR1214757", "", "173ed87acc794a704aa000c6ab5d63a8")},
{new PRTest("ERR1214757", " -L NC_000001.10:1-50000", "6bc055f028c49bcbca990857e57a6e4b")},
{new PRTest("ERR1214757", " -L NC_000001.10:500000-1000000", "ab545064b2314971cfae7486ff74d779")},
};
}
@Test(groups = "sra", dataProvider = "PRTest")
public void testPrintReads(PRTest params) {
WalkerTestSpec spec = new WalkerTestSpec(
"-T PrintReads" +
" -R " + params.accession +
" -I " + params.accession +
params.args +
" --no_pg_tag" +
" -o %s",
Collections.singletonList(params.md5));
executeTest("testPrintReads-"+params.args, spec).getFirst();
}
}

View File

@ -27,7 +27,7 @@ package org.broadinstitute.gatk.utils.activeregion;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.GenomeLocSortedSet;
@ -168,7 +168,7 @@ public class ActiveRegion implements HasGenomeLocation {
/**
* See #getActiveRegionReference but with padding == 0
*/
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) {
public byte[] getActiveRegionReference( final ReferenceSequenceFile referenceReader ) {
return getActiveRegionReference(referenceReader, 0);
}
@ -182,21 +182,21 @@ public class ActiveRegion implements HasGenomeLocation {
* @return a non-null array of bytes holding the reference bases in referenceReader
*/
@Ensures("result != null")
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
public byte[] getActiveRegionReference( final ReferenceSequenceFile referenceReader, final int padding ) {
return getReference(referenceReader, padding, extendedLoc);
}
/**
* See #getActiveRegionReference but using the span including regions not the extended span
*/
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) {
public byte[] getFullReference( final ReferenceSequenceFile referenceReader ) {
return getFullReference(referenceReader, 0);
}
/**
* See #getActiveRegionReference but using the span including regions not the extended span
*/
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
public byte[] getFullReference( final ReferenceSequenceFile referenceReader, final int padding ) {
return getReference(referenceReader, padding, spanIncludingReads);
}
@ -211,7 +211,7 @@ public class ActiveRegion implements HasGenomeLocation {
* @return a non-null array of bytes holding the reference bases in referenceReader
*/
@Ensures("result != null")
public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
public byte[] getReference( final ReferenceSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null");
if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding);
if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null");

View File

@ -25,12 +25,12 @@
package org.broadinstitute.gatk.utils.baq;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
@ -151,7 +151,7 @@ public class BAQ {
* @param b band width
* @param minBaseQual All bases with Q < minBaseQual are up'd to this value
*/
public BAQ(final double d, final double e, final int b, final byte minBaseQual, boolean includeClippedBases) {
public BAQ(final double d, final double e, final int b, final byte minBaseQual, final boolean includeClippedBases) {
cd = d; ce = e; cb = b;
this.minBaseQual = minBaseQual;
this.includeClippedBases = includeClippedBases;
@ -521,7 +521,7 @@ public class BAQ {
}
}
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
public BAQCalculationResult calcBAQFromHMM(final SAMRecord read, final ReferenceSequenceFile refReader) {
// start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar
int offset = getBandWidth() / 2;
long readStart = includeClippedBases ? read.getUnclippedStart() : read.getAlignmentStart();
@ -540,7 +540,7 @@ public class BAQ {
// final SimpleTimer total = new SimpleTimer();
// final SimpleTimer local = new SimpleTimer();
// int n = 0;
public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals, int queryStart, int queryEnd ) {
public BAQCalculationResult calcBAQFromHMM(final byte[] ref, final byte[] query, final byte[] quals, final int queryStart, final int queryEnd ) {
// total.restart();
if ( queryStart < 0 ) throw new ReviewedGATKException("BUG: queryStart < 0: " + queryStart);
if ( queryEnd < 0 ) throw new ReviewedGATKException("BUG: queryEnd < 0: " + queryEnd);
@ -564,7 +564,7 @@ public class BAQ {
* @param read
* @return
*/
private final Pair<Integer,Integer> calculateQueryRange(SAMRecord read) {
private final Pair<Integer,Integer> calculateQueryRange(final SAMRecord read) {
int queryStart = -1, queryStop = -1;
int readI = 0;
@ -599,7 +599,7 @@ public class BAQ {
}
// we need to pad ref by at least the bandwidth / 2 on either side
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
public BAQCalculationResult calcBAQFromHMM(final SAMRecord read, final byte[] ref, final int refOffset) {
// todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
Pair<Integer, Integer> queryRange = calculateQueryRange(read);
if ( queryRange == null ) return null; // read has Ns, or is completely clipped away
@ -642,7 +642,7 @@ public class BAQ {
return baqResult;
}
public byte capBaseByBAQ( byte oq, byte bq, int state, int expectedPos ) {
public byte capBaseByBAQ( final byte oq, final byte bq, final int state, final int expectedPos ) {
byte b;
boolean isIndel = stateIsIndel(state);
int pos = stateAlignedPosition(state);
@ -664,7 +664,7 @@ public class BAQ {
* @param calculationType
* @return BQ qualities for use, in case qmode is DONT_MODIFY
*/
public byte[] baqRead(SAMRecord read, IndexedFastaSequenceFile refReader, CalculationMode calculationType, QualityMode qmode ) {
public byte[] baqRead(final SAMRecord read, final ReferenceSequenceFile refReader, final CalculationMode calculationType, final QualityMode qmode ) {
if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName());
byte[] BAQQuals = read.getBaseQualities(); // in general we are overwriting quals, so just get a pointer to them
@ -706,7 +706,7 @@ public class BAQ {
* @param read
* @return
*/
public boolean excludeReadFromBAQ(SAMRecord read) {
public boolean excludeReadFromBAQ(final SAMRecord read) {
// keeping mapped reads, regardless of pairing status, or primary alignment status.
return read.getReadUnmappedFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getDuplicateReadFlag();
}

View File

@ -25,6 +25,9 @@
package org.broadinstitute.gatk.utils.fasta;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.sra.SRAAccession;
import htsjdk.samtools.sra.SRAIndexedSequenceFile;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.reference.FastaSequenceIndex;
@ -192,8 +195,17 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* Possibly may be better as an overloaded constructor.
* @param fastaFile Fasta file to be used as reference
* @return A new instance of a CachingIndexedFastaSequenceFile.
* @throws IllegalArgumentException if Fasta file is null
*/
public static CachingIndexedFastaSequenceFile checkAndCreate(final File fastaFile) {
public static ReferenceSequenceFile checkAndCreate(final File fastaFile) {
if ( fastaFile == null ) {
throw new IllegalArgumentException("Fasta file is null");
}
// maybe it is SRA file?
if (SRAAccession.isValid(fastaFile.getPath())) {
return new SRAIndexedSequenceFile(new SRAAccession(fastaFile.getPath()));
}
// does the fasta file exist? check that first...
if (!fastaFile.exists())
throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist.");

View File

@ -25,10 +25,10 @@
package org.broadinstitute.gatk.utils.locusiterator;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.CommandLineProgram;
@ -62,7 +62,7 @@ public class LIBSPerformance extends CommandLineProgram {
@Override
public int execute() throws IOException {
final IndexedFastaSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile);
final ReferenceSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile);
final GenomeLocParser genomeLocParser = new GenomeLocParser(reference);
final SAMFileReader reader = new SAMFileReader(samFile);

View File

@ -25,8 +25,8 @@
package org.broadinstitute.gatk.utils.sam;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.NGSPlatform;
@ -52,7 +52,7 @@ import java.util.*;
public class ArtificialBAMBuilder {
public final static int BAM_SHARD_SIZE = 16384;
private final IndexedFastaSequenceFile reference;
private final ReferenceSequenceFile reference;
private final GenomeLocParser parser;
final int nReadsPerLocus;
@ -73,7 +73,7 @@ public class ArtificialBAMBuilder {
SAMFileHeader header;
public ArtificialBAMBuilder(final IndexedFastaSequenceFile reference, int nReadsPerLocus, int nLoci) {
public ArtificialBAMBuilder(final ReferenceSequenceFile reference, int nReadsPerLocus, int nLoci) {
this.nReadsPerLocus = nReadsPerLocus;
this.nLoci = nLoci;
@ -94,7 +94,7 @@ public class ArtificialBAMBuilder {
createAndSetHeader(1);
}
public IndexedFastaSequenceFile getReference() {
public ReferenceSequenceFile getReference() {
return reference;
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.gatk.utils.text;
import htsjdk.samtools.sra.SRAAccession;
import org.broadinstitute.gatk.utils.commandline.ParsingEngine;
import org.broadinstitute.gatk.utils.commandline.RodBinding;
import org.broadinstitute.gatk.utils.commandline.Tags;
@ -59,6 +60,13 @@ public class ListFileUtils {
* @return a flattened list of the bam files provided
*/
public static List<SAMReaderID> unpackBAMFileList(final List<String> samFiles, final ParsingEngine parser) {
if ( samFiles == null ) {
throw new IllegalArgumentException("The SAM files are null");
}
if ( parser == null ) {
throw new IllegalArgumentException("The parser is null");
}
List<SAMReaderID> unpackedReads = new ArrayList<SAMReaderID>();
for( String inputFileName: samFiles ) {
Tags inputFileNameTags = parser.getTags(inputFileName);
@ -79,10 +87,14 @@ public class ListFileUtils {
else if(inputFileName.endsWith("stdin")) {
unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags));
}
else if(SRAAccession.isValid(inputFileName)) {
unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags));
}
else {
throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM/CRAM files with the .bam/.cram extension and lists of BAM/CRAM files " +
"with the .list extension, but the file %s has neither extension. Please ensure that your BAM/CRAM file or list " +
"of BAM/CRAM files is in the correct format, update the extension, and try again.",inputFileName));
throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM/CRAM files with the .bam/.cram extension " + "" +
"or SRA archives and lists of BAM/CRAM/SRA files with the .list extension, but the file %s has " +
"neither extension and is not SRA accession. Please ensure that your BAM/CRAM file or list " +
"of BAM/CRAM files is in the correct format, update the extension, and try again.", inputFileName));
}
}
return unpackedReads;

View File

@ -29,10 +29,10 @@ package org.broadinstitute.gatk.utils;
// the imports for unit testing.
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
@ -57,7 +57,7 @@ public class ExampleToCopyUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
// example fasta index file, can be deleted if you don't use the reference
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
@BeforeClass
public void setup() throws FileNotFoundException {

View File

@ -27,7 +27,7 @@ package org.broadinstitute.gatk.utils;
import com.google.caliper.Param;
import com.google.caliper.SimpleBenchmark;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.File;
@ -36,7 +36,7 @@ import java.io.File;
* Caliper microbenchmark of genome loc parser
*/
public class GenomeLocParserBenchmark extends SimpleBenchmark {
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
private final int ITERATIONS = 1000000;
@Param({"NEW", "NONE"})

View File

@ -28,10 +28,9 @@ package org.broadinstitute.gatk.utils;
// the imports for unit testing.
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
@ -53,7 +52,7 @@ public class NGSPlatformUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
// example fasta index file, can be deleted if you don't use the reference
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
@BeforeClass
public void setup() throws FileNotFoundException {

View File

@ -29,8 +29,8 @@ package org.broadinstitute.gatk.utils.activeregion;
// the imports for unit testing.
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@ -51,7 +51,7 @@ import java.util.*;
public class ActiveRegionUnitTest extends BaseTest {
private final static boolean DEBUG = false;
private GenomeLocParser genomeLocParser;
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
private String contig;
private int contigLength;

View File

@ -26,6 +26,7 @@
package org.broadinstitute.gatk.utils.baq;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.Test;
@ -52,7 +53,7 @@ public class BAQUnitTest extends BaseTest {
private final int startChr = 1;
private final int numChr = 2;
private final int chrSize = 1000;
IndexedFastaSequenceFile fasta = null;
ReferenceSequenceFile fasta = null;
@BeforeMethod
public void before() {

View File

@ -32,6 +32,7 @@ package org.broadinstitute.gatk.utils.fasta;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Priority;
import org.broadinstitute.gatk.utils.BaseTest;
@ -93,7 +94,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException {
Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
final ReferenceSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) {
@ -123,7 +124,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
// Tests grabbing sequences around a middle cached value.
@Test(dataProvider = "fastas", enabled = true && ! DEBUG)
public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
final ReferenceSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
@ -191,7 +192,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
// make sure some bases are lower case and some are upper case
@Test(enabled = true)
public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException {
final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA));
final ReferenceSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA));
final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(new File(exampleFASTA), true);
final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(new File(exampleFASTA));
@ -208,9 +209,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file. Unexpected test state");
}
private int testCases(final IndexedFastaSequenceFile original,
final IndexedFastaSequenceFile casePreserving,
final IndexedFastaSequenceFile allUpper,
private int testCases(final ReferenceSequenceFile original,
final ReferenceSequenceFile casePreserving,
final ReferenceSequenceFile allUpper,
final String contig, final int start, final int stop ) {
final String orig = fetchBaseString(original, contig, start, stop);
final String keptCase = fetchBaseString(casePreserving, contig, start, stop);
@ -226,7 +227,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
}
}
private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start, final int stop) {
private String fetchBaseString(final ReferenceSequenceFile reader, final String contig, final int start, final int stop) {
if ( start == -1 )
return new String(reader.getSequence(contig).getBases());
else

View File

@ -25,8 +25,7 @@
package org.broadinstitute.gatk.utils.refdata.tracks;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import org.broadinstitute.gatk.utils.BaseTest;
@ -66,7 +65,7 @@ public class FeatureManagerUnitTest extends BaseTest {
public void setup() {
File referenceFile = new File(b36KGReference);
try {
IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile);
final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile);
genomeLocParser = new GenomeLocParser(seq);
manager = new FeatureManager();
}

View File

@ -26,7 +26,7 @@
package org.broadinstitute.gatk.utils.refdata.tracks;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Tribble;
import htsjdk.tribble.index.Index;
import htsjdk.tribble.util.LittleEndianOutputStream;
@ -54,7 +54,7 @@ import java.nio.channels.FileChannel;
*/
public class RMDTrackBuilderUnitTest extends BaseTest {
private RMDTrackBuilder builder;
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
private GenomeLocParser genomeLocParser;
@BeforeMethod

View File

@ -25,7 +25,7 @@
package org.broadinstitute.gatk.utils.refdata.utils;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.GenomeLoc;
@ -43,7 +43,7 @@ public class FeatureToGATKFeatureIteratorUnitTest extends BaseTest {
@SuppressWarnings("unchecked")
public void testCloseFilePointers() throws IOException {
final String chr = "20";
IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference));
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference));
GenomeLocParser parser = new GenomeLocParser(seq);
File file = new File(privateTestDir + "NA12878.hg19.example1.vcf");
VCFCodec codec = new VCFCodec();

View File

@ -26,8 +26,8 @@
package org.broadinstitute.gatk.utils.sam;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.Utils;
@ -223,7 +223,7 @@ public class ReadUtilsUnitTest extends BaseTest {
@Test (enabled = true)
public void testReadWithNsRefIndexInDeletion() throws FileNotFoundException {
final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
final int readLength = 76;
@ -239,7 +239,7 @@ public class ReadUtilsUnitTest extends BaseTest {
@Test (enabled = true)
public void testReadWithNsRefAfterDeletion() throws FileNotFoundException {
final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
final int readLength = 76;