diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java index 4205ed2d2..4b621b41d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.bqsr; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.engine.recalibration.*; import org.broadinstitute.gatk.engine.walkers.*; @@ -194,7 +194,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to mask out known variant sites. Please provide a VCF file containing known sites of genetic variation."; private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector - private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation + private ReferenceSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation private final static byte NO_BAQ_UNCERTAINTY = (byte)'@'; /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index b284b56a6..80a95239c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.java.contract.Ensures; import htsjdk.samtools.SAMFileWriter; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFConstants; @@ -487,7 +488,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private HaplotypeCallerGenotypingEngine genotypingEngine = null; // fasta reference reader to supplement the edges of the reference sequence - protected CachingIndexedFastaSequenceFile referenceReader; + protected ReferenceSequenceFile referenceReader; // reference base padding size private static final int REFERENCE_PADDING = 500; @@ -683,12 +684,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In vcfWriter.writeHeader(new VCFHeader(headerInfo, sampleSet)); - try { - // fasta reference reader to supplement the edges of the reference sequence - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); - } + // fasta reference reader to supplement the edges of the reference sequence + referenceReader = CachingIndexedFastaSequenceFile.checkAndCreate(getToolkit().getArguments().referenceFile); // create and setup the assembler assemblyEngine = new ReadThreadingAssembler(RTAC.maxNumHaplotypesInPopulation, RTAC.kmerSizes, RTAC.dontIncreaseKmerSizesForCycles, RTAC.allowNonUniqueKmersInRef, RTAC.numPruningSamples); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotatorUnitTest.java index 010f3bb72..8008eaf86 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotatorUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotatorUnitTest.java @@ -51,7 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -71,7 +71,7 @@ import java.util.*; public class VariantOverlapAnnotatorUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/BaseQualitySumPerAlleleBySampleUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/BaseQualitySumPerAlleleBySampleUnitTest.java index 1a304891f..26f6891a4 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/BaseQualitySumPerAlleleBySampleUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/BaseQualitySumPerAlleleBySampleUnitTest.java @@ -76,5 +76,4 @@ public class BaseQualitySumPerAlleleBySampleUnitTest { Assert.assertFalse(a.isUsableRead(read)); } - } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 747609ace..e34f734e3 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.variant.variantcontext.VariantContext; @@ -207,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { private void HCTestNearbySmallIntervals(String bam, String args, String md5) { try { - final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java index 28690d993..4a0f9b9f2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java @@ -51,11 +51,11 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -80,7 +80,7 @@ import java.util.*; public class LocalAssemblyEngineUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private SAMFileHeader header; @BeforeClass diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModelUnitTest.java index 60f7f0fdf..410b895ac 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModelUnitTest.java @@ -51,9 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.indels; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.UnvalidatingGenomeLoc; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; @@ -76,7 +75,7 @@ public class PairHMMIndelErrorModelUnitTest extends BaseTest { @BeforeClass public void setup() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/ContigComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/ContigComparatorUnitTest.java index be669cca2..4b2ac1b88 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/ContigComparatorUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/ContigComparatorUnitTest.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.utils; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; @@ -71,7 +71,7 @@ public class ContigComparatorUnitTest extends BaseTest { @BeforeClass public void setup() throws FileNotFoundException { // sequence - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); final GenomeLocParser genomeLocParser = new GenomeLocParser(seq); dictForFails = genomeLocParser.getContigs(); } @@ -81,7 +81,7 @@ public class ContigComparatorUnitTest extends BaseTest { List tests = new ArrayList(); for ( final String ref : Arrays.asList(b37KGReference, hg18Reference) ) { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(ref)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(ref)); final GenomeLocParser genomeLocParser = new GenomeLocParser(seq); final SAMSequenceDictionary dict = genomeLocParser.getContigs(); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index 95e4806fa..b9656ebdb 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.utils.genotyper; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.utils.BaseUtils; @@ -64,7 +65,6 @@ import java.util.Map; import java.util.List; import org.testng.Assert; import org.testng.annotations.Test; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; @@ -81,7 +81,7 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; // example fasta index file, can be deleted if you don't use the reference - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java index 73dc0749c..9598349bb 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java @@ -37,6 +37,7 @@ import org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor; import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.utils.text.ListFileUtils; +import htsjdk.samtools.sra.SRAAccession; import java.util.ArrayList; import java.util.Arrays; @@ -63,10 +64,19 @@ public abstract class CommandLineExecutable extends CommandLineProgram { /** * A list of all the arguments initially used as sources. */ - private final Collection argumentSources = new ArrayList(); + private final Collection argumentSources = new ArrayList<>(); protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); + private final static String SRA_LIBS_DOWNLOAD = "samjdk.sra_libraries_download"; + /** + * Set GATK version to be used as part of user agent for network requests + */ + static { + System.setProperty(SRA_LIBS_DOWNLOAD, "true"); + SRAAccession.setAppVersionString("GATK " + getVersionNumber()); + } + /** * this is the function that the inheriting class can expect to have called * when the command line system has initialized. diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java index 469e5491d..85c535c1f 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java @@ -905,7 +905,7 @@ public class GenomeAnalysisEngine { * @return A data source for the given set of reads. */ private SAMDataSource createReadsDataSource(final GATKArgumentCollection argCollection, final GenomeLocParser genomeLocParser, - final IndexedFastaSequenceFile refReader, final Map sampleRenameMap) { + final ReferenceSequenceFile refReader, final Map sampleRenameMap) { DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); // Synchronize the method back into the collection so that it shows up when diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java index fe3f5052c..db098e0e3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java @@ -27,6 +27,7 @@ package org.broadinstitute.gatk.engine.alignment.bwa.java; import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.alignment.Aligner; import org.broadinstitute.gatk.engine.alignment.Alignment; import org.broadinstitute.gatk.utils.BaseUtils; @@ -126,7 +127,7 @@ public class AlignerTestHarness { mismatches++; - IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); + final ReferenceSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); System.out.printf("read = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(read.getReadString(),read.getCigar(),CigarOperator.DELETION), read.getAlignmentStart(), diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java index 227675eea..20615c4d6 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.ReadProperties; import org.broadinstitute.gatk.engine.datasources.reads.Shard; import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; @@ -62,7 +62,7 @@ public class LocusShardDataProvider extends ShardDataProvider { * @param shard The chunk of data over which traversals happen. * @param reference A getter for a section of the reference. */ - public LocusShardDataProvider(Shard shard, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, GenomeLoc locus, LocusIterator locusIterator, IndexedFastaSequenceFile reference, Collection rods) { + public LocusShardDataProvider(final Shard shard, final ReadProperties sourceInfo, final GenomeLocParser genomeLocParser, final GenomeLoc locus, final LocusIterator locusIterator, final ReferenceSequenceFile reference, final Collection rods) { super(shard,genomeLocParser,reference,rods); this.sourceInfo = sourceInfo; this.locus = locus; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java index 15f861c20..0483f5bc4 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.datasources.reads.Shard; import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; @@ -50,7 +51,7 @@ public class ReadShardDataProvider extends ShardDataProvider { * @param shard The chunk of data over which traversals happen. * @param reference A getter for a section of the reference. */ - public ReadShardDataProvider(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator reads, IndexedFastaSequenceFile reference, Collection rods) { + public ReadShardDataProvider(final Shard shard, final GenomeLocParser genomeLocParser, final GATKSAMIterator reads, final ReferenceSequenceFile reference, final Collection rods) { super(shard,genomeLocParser,reference,rods); this.reads = reads; } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java index e5932bd94..4e8eae8c5 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java @@ -29,6 +29,7 @@ import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; @@ -61,13 +62,13 @@ public class ReferenceView implements View { /** * The source of reference data. */ - protected IndexedFastaSequenceFile reference = null; + protected ReferenceSequenceFile reference = null; /** * Create a new ReferenceView. * @param provider */ - public ReferenceView( ShardDataProvider provider ) { + public ReferenceView( final ShardDataProvider provider ) { this.genomeLocParser = provider.getGenomeLocParser(); this.reference = provider.getReference(); } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java index 27c71173a..c1b663f8d 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.datasources.reads.Shard; import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -70,7 +70,7 @@ public abstract class ShardDataProvider { /** * Provider of reference data for this particular shard. */ - private final IndexedFastaSequenceFile reference; + private final ReferenceSequenceFile reference; /** * Sources of reference-ordered data. @@ -106,7 +106,7 @@ public abstract class ShardDataProvider { * Gets a pointer into the given indexed fasta sequence file. * @return The indexed fasta sequence file. */ - IndexedFastaSequenceFile getReference() { + ReferenceSequenceFile getReference() { return reference; } @@ -131,7 +131,7 @@ public abstract class ShardDataProvider { * @param shard The chunk of data over which traversals happen. * @param reference A getter for a section of the reference. */ - public ShardDataProvider(Shard shard,GenomeLocParser genomeLocParser,IndexedFastaSequenceFile reference,Collection rods) { + public ShardDataProvider(final Shard shard, final GenomeLocParser genomeLocParser, final ReferenceSequenceFile reference, final Collection rods) { this.shard = shard; this.genomeLocParser = genomeLocParser; this.reference = reference; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java index 394923b1e..7f954d82e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java @@ -25,20 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import htsjdk.samtools.*; -import htsjdk.samtools.seekablestream.SeekableBufferedStream; -import htsjdk.samtools.seekablestream.SeekableFileStream; -import htsjdk.samtools.seekablestream.SeekableStream; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import htsjdk.samtools.Bin; /** * A basic interface for querying BAM indices. @@ -47,116 +34,24 @@ import java.util.List; * @author mhanna * @version 0.1 */ -public class GATKBAMIndex { +public abstract class GATKBAMIndex { + /** - * BAM index file magic number. + * What is the starting bin for each level? */ - private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); + protected static final int[] LEVEL_STARTS = {0,1,9,73,585,4681}; /** * Reports the total amount of genomic data that any bin can index. */ protected static final int BIN_GENOMIC_SPAN = 512*1024*1024; - /** - * What is the starting bin for each level? - */ - private static final int[] LEVEL_STARTS = {0,1,9,73,585,4681}; - /** * Reports the maximum number of bins that can appear in a BAM file. */ public static final int MAX_BINS = 37450; // =(8^6-1)/7+1 - private final SAMSequenceDictionary sequenceDictionary; - private final File mFile; - - //TODO: figure out a good value for this buffer size - private static final int BUFFERED_STREAM_BUFFER_SIZE = 8192; - - /** - * Number of sequences stored in this index. - */ - private final int sequenceCount; - - /** - * A cache of the starting positions of the sequences. - */ - private final long[] sequenceStartCache; - - private SeekableFileStream fileStream; - private SeekableStream baiStream; - private SeekableBufferedStream bufferedStream; - private long fileLength; - - public GATKBAMIndex(final File file, final SAMSequenceDictionary sequenceDictionary) { - mFile = file; - this.sequenceDictionary = sequenceDictionary; - - // Open the file stream. - openIndexFile(); - - // Verify the magic number. - seek(0); - final byte[] buffer = readBytes(4); - if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) { - throw new ReviewedGATKException("Invalid file header in BAM index " + mFile + - ": " + new String(buffer)); - } - - seek(4); - - sequenceCount = readInteger(); - - // Create a cache of the starting position of each sequence. Initialize it to -1. - sequenceStartCache = new long[sequenceCount]; - for(int i = 1; i < sequenceCount; i++) - sequenceStartCache[i] = -1; - - // Seed the first element in the array with the current position. - if(sequenceCount > 0) - sequenceStartCache[0] = position(); - - closeIndexFile(); - } - - public GATKBAMIndexData readReferenceSequence(final int referenceSequence) { - openIndexFile(); - - if (referenceSequence >= sequenceCount) - throw new ReviewedGATKException("Invalid sequence number " + referenceSequence + " in index file " + mFile); - - skipToSequence(referenceSequence); - - int binCount = readInteger(); - List bins = new ArrayList<>(); - for (int binNumber = 0; binNumber < binCount; binNumber++) { - final int indexBin = readInteger(); - final int nChunks = readInteger(); - - List chunks = new ArrayList<>(nChunks); - long[] rawChunkData = readLongs(nChunks*2); - for (int ci = 0; ci < nChunks; ci++) { - final long chunkBegin = rawChunkData[ci*2]; - final long chunkEnd = rawChunkData[ci*2+1]; - chunks.add(new GATKChunk(chunkBegin, chunkEnd)); - } - GATKBin bin = new GATKBin(referenceSequence, indexBin); - bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()])); - while(indexBin >= bins.size()) - bins.add(null); - bins.set(indexBin,bin); - } - - final int nLinearBins = readInteger(); - long[] linearIndexEntries = readLongs(nLinearBins); - - LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries); - - closeIndexFile(); - - return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex); - } + public abstract GATKBAMIndexData readReferenceSequence(final int referenceSequence); /** * Get the number of levels employed by this index. @@ -180,91 +75,35 @@ public class GATKBAMIndex { * @param levelNumber Level number. 0-based. * @return The size (number of possible bins) of the given level. */ - public int getLevelSize(final int levelNumber) { - if(levelNumber == getNumIndexLevels()-1) - return MAX_BINS-LEVEL_STARTS[levelNumber]-1; - else - return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber]; - } + public abstract int getLevelSize(final int levelNumber); /** * Gets the level associated with the given bin number. * @param bin The bin for which to determine the level. * @return the level associated with the given bin number. */ - public int getLevelForBin(final Bin bin) { - GATKBin gatkBin = new GATKBin(bin); - if(gatkBin.getBinNumber() >= MAX_BINS) - throw new ReviewedGATKException("Tried to get level for invalid bin in index file " + mFile); - for(int i = getNumIndexLevels()-1; i >= 0; i--) { - if(gatkBin.getBinNumber() >= LEVEL_STARTS[i]) - return i; - } - throw new ReviewedGATKException("Unable to find correct bin for bin " + bin + " in index file " + mFile); - } + public abstract int getLevelForBin(final Bin bin); /** * Gets the first locus that this bin can index into. * @param bin The bin to test. * @return The last position that the given bin can represent. */ - public int getFirstLocusInBin(final Bin bin) { - final int level = getLevelForBin(bin); - final int levelStart = LEVEL_STARTS[level]; - final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; - return (new GATKBin(bin).getBinNumber() - levelStart)*(BIN_GENOMIC_SPAN /levelSize)+1; - } + public abstract int getFirstLocusInBin(final Bin bin); /** * Gets the last locus that this bin can index into. * @param bin The bin to test. * @return The last position that the given bin can represent. */ - public int getLastLocusInBin(final Bin bin) { - final int level = getLevelForBin(bin); - final int levelStart = LEVEL_STARTS[level]; - final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; - return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize); - } + public abstract int getLastLocusInBin(final Bin bin); /** * Use to get close to the unmapped reads at the end of a BAM file. * @return The file offset of the first record in the last linear bin, or -1 * if there are no elements in linear bins (i.e. no mapped reads). */ - public long getStartOfLastLinearBin() { - openIndexFile(); - - seek(4); - - final int sequenceCount = readInteger(); - // Because no reads may align to the last sequence in the sequence dictionary, - // grab the last element of the linear index for each sequence, and return - // the last one from the last sequence that has one. - long lastLinearIndexPointer = -1; - for (int i = 0; i < sequenceCount; i++) { - // System.out.println("# Sequence TID: " + i); - final int nBins = readInteger(); - // System.out.println("# nBins: " + nBins); - for (int j1 = 0; j1 < nBins; j1++) { - // Skip bin # - skipBytes(4); - final int nChunks = readInteger(); - // Skip chunks - skipBytes(16 * nChunks); - } - final int nLinearBins = readInteger(); - if (nLinearBins > 0) { - // Skip to last element of list of linear bins - skipBytes(8 * (nLinearBins - 1)); - lastLinearIndexPointer = readLongs(1)[0]; - } - } - - closeIndexFile(); - - return lastLinearIndexPointer; - } + public abstract long getStartOfLastLinearBin(); /** * Gets the possible number of bins for a given reference sequence. @@ -273,197 +112,4 @@ public class GATKBAMIndex { protected int getMaxAddressibleGenomicLocation() { return BIN_GENOMIC_SPAN; } - - protected void skipToSequence(final int referenceSequence) { - // Find the offset in the file of the last sequence whose position has been determined. Start here - // when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always - // be present, so no extra stopping condition is necessary. - int sequenceIndex = referenceSequence; - while(sequenceStartCache[sequenceIndex] == -1) - sequenceIndex--; - - // Advance to the most recently found position. - seek(sequenceStartCache[sequenceIndex]); - - for (int i = sequenceIndex; i < referenceSequence; i++) { - sequenceStartCache[i] = position(); - // System.out.println("# Sequence TID: " + i); - final int nBins = readInteger(); - // System.out.println("# nBins: " + nBins); - for (int j = 0; j < nBins; j++) { - /* final int bin = */ - readInteger(); - final int nChunks = readInteger(); - // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); - skipBytes(16 * nChunks); - } - final int nLinearBins = readInteger(); - // System.out.println("# nLinearBins: " + nLinearBins); - skipBytes(8 * nLinearBins); - - } - - sequenceStartCache[referenceSequence] = position(); - } - - - - private void openIndexFile() { - try { - fileStream = new SeekableFileStream(mFile); - baiStream = SamIndexes.asBaiSeekableStreamOrNull(fileStream, sequenceDictionary); - bufferedStream = new SeekableBufferedStream(baiStream, BUFFERED_STREAM_BUFFER_SIZE); - fileLength=bufferedStream.length(); - } - catch (IOException exc) { - throw new ReviewedGATKException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc); - } - } - - private void closeIndexFile() { - try { - bufferedStream.close(); - baiStream.close(); - fileStream.close(); - fileLength = -1; - } - catch (IOException exc) { - throw new ReviewedGATKException("Unable to close index file " + mFile, exc); - } - } - - private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; - private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; - - private byte[] readBytes(int count) { - ByteBuffer buffer = getBuffer(count); - read(buffer); - buffer.flip(); - byte[] contents = new byte[count]; - buffer.get(contents); - return contents; - } - - private int readInteger() { - ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES); - read(buffer); - buffer.flip(); - return buffer.getInt(); - } - - /** - * Reads an array of longs from the file channel, returning the results as an array. - * @param count Number of longs to read. - * @return An array of longs. Size of array should match count. - */ - private long[] readLongs(final int count) { - ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES); - read(buffer); - buffer.flip(); - long[] result = new long[count]; - for(int i = 0; i < count; i++) - result[i] = buffer.getLong(); - return result; - } - - private void read(final ByteBuffer buffer) { - final int bytesRequested = buffer.limit(); - if (bytesRequested == 0) - return; - - try { - - //BufferedInputStream cannot read directly into a byte buffer, so we read into an array - //and put the result into the bytebuffer after the if statement. - - // We have a rigid expectation here to read in exactly the number of bytes we've limited - // our buffer to -- if there isn't enough data in the file, the index - // must be truncated or otherwise corrupt: - if(bytesRequested > fileLength - bufferedStream.position()){ - throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + - "It's likely that this file is truncated or corrupt -- " + - "Please try re-indexing the corresponding BAM file.", - mFile)); - } - - int bytesRead = bufferedStream.read(byteArray, 0, bytesRequested); - - // We have a rigid expectation here to read in exactly the number of bytes we've limited - // our buffer to -- if we encounter EOF (-1), the index - // must be truncated or otherwise corrupt: - if (bytesRead <= 0) { - throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + - "It's likely that this file is truncated or corrupt -- " + - "Please try re-indexing the corresponding BAM file.", - mFile)); - } - - if(bytesRead != bytesRequested) - throw new RuntimeException("Read amount different from requested amount. This should not happen."); - - buffer.put(byteArray, 0, bytesRequested); - } - catch(IOException ex) { - throw new ReviewedGATKException("Index: unable to read bytes from index file " + mFile); - } - } - - - /** - * A reusable buffer for use by this index generator. - * TODO: Should this be a SoftReference? - */ - private ByteBuffer buffer = null; - - //BufferedStream don't read into ByteBuffers, so we need this temporary array - private byte[] byteArray=null; - private ByteBuffer getBuffer(final int size) { - if(buffer == null || buffer.capacity() < size) { - // Allocate a new byte buffer. For now, make it indirect to make sure it winds up on the heap for easier debugging. - buffer = ByteBuffer.allocate(size); - byteArray = new byte[size]; - buffer.order(ByteOrder.LITTLE_ENDIAN); - } - buffer.clear(); - buffer.limit(size); - return buffer; - } - - private void skipBytes(final int count) { - try { - - //try to skip forward the requested amount. - long skipped = bufferedStream.skip(count); - - if( skipped != count ) { //if not managed to skip the requested amount - throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); - } - } - catch(IOException ex) { - throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); - } - } - - private void seek(final long position) { - try { - //to seek a new position, move the fileChannel, and reposition the bufferedStream - bufferedStream.seek(position); - } - catch(IOException ex) { - throw new ReviewedGATKException("Index: unable to reposition of file channel of index file " + mFile); - } - } - - /** - * Retrieve the position from the current file channel. - * @return position of the current file channel. - */ - private long position() { - try { - return bufferedStream.position(); - } - catch (IOException exc) { - throw new ReviewedGATKException("Unable to read position from index file " + mFile, exc); - } - } } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSource.java new file mode 100644 index 000000000..5f3f92b72 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSource.java @@ -0,0 +1,109 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * This class implements BAM index querying API + * by wrapping a class BrowseableBAMIndex from HTSJDK. + * + * @version 0.1 + */ +public class GATKBAMIndexFromDataSource extends GATKBAMIndex { + private File sourceFile; + private SAMFileHeader sourceHeader; + private BrowseableBAMIndex index; + + public GATKBAMIndexFromDataSource(final File sourceFile, final SAMFileHeader sourceHeader, final BrowseableBAMIndex index) { + this.sourceFile = sourceFile; + this.sourceHeader = sourceHeader; + this.index = index; + } + + @Override + public GATKBAMIndexData readReferenceSequence(final int referenceSequence) { + final List sequences = sourceHeader.getSequenceDictionary().getSequences(); + if (referenceSequence >= sequences.size()) + throw new ReviewedGATKException("Sequence number " + referenceSequence + " cannot be greater or equal to " + sequences.size() + " in index file " + sourceFile); + + + final BinList sourceBins = index.getBinsOverlapping(referenceSequence, 0, sequences.get(referenceSequence).getSequenceLength()); + + final List bins = new ArrayList<>(); + for (Bin sourceBin : sourceBins) { + final int indexBin = sourceBin.getBinNumber(); + while(indexBin >= bins.size()) + bins.add(null); + + final GATKBin bin = new GATKBin(referenceSequence, indexBin); + final List chunks = index.getSpanOverlapping(sourceBin).getChunks(); + final List gatkChunks = new ArrayList<>(chunks.size()); + for (Chunk chunk : chunks) { + gatkChunks.add(new GATKChunk(chunk)); + } + + bin.setChunkList(gatkChunks.toArray(new GATKChunk[gatkChunks.size()])); + + bins.set(indexBin, bin); + } + + // there is no interface to get linear index from HTSJDK + final LinearIndex linearIndex = new LinearIndex(referenceSequence, 0, new long[]{}); + + return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex); + } + + @Override + public int getLevelSize(int levelNumber) { + return index.getLevelSize(levelNumber); + } + + @Override + public int getLevelForBin(Bin bin) { + return index.getLevelForBin(bin); + } + + @Override + public int getFirstLocusInBin(Bin bin) { + return index.getFirstLocusInBin(bin); + } + + @Override + public int getLastLocusInBin(Bin bin) { + return index.getLastLocusInBin(bin); + } + + @Override + public long getStartOfLastLinearBin() { + return index.getStartOfLastLinearBin(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFile.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFile.java new file mode 100644 index 000000000..f1498a63a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFile.java @@ -0,0 +1,435 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.*; +import htsjdk.samtools.seekablestream.SeekableBufferedStream; +import htsjdk.samtools.seekablestream.SeekableFileStream; +import htsjdk.samtools.seekablestream.SeekableStream; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * This class implements BAM index querying API + * by reading .bai file directly. + * Very much not thread-safe. + * + * @author mhanna + * @version 0.1 + */ +public class GATKBAMIndexFromFile extends GATKBAMIndex { + /** + * BAM index file magic number. + */ + private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); + + private final SAMSequenceDictionary sequenceDictionary; + private final File mFile; + + //TODO: figure out a good value for this buffer size + private static final int BUFFERED_STREAM_BUFFER_SIZE = 8192; + + /** + * Number of sequences stored in this index. + */ + private final int sequenceCount; + + /** + * A cache of the starting positions of the sequences. + */ + private final long[] sequenceStartCache; + + private SeekableFileStream fileStream; + private SeekableStream baiStream; + private SeekableBufferedStream bufferedStream; + private long fileLength; + + public GATKBAMIndexFromFile(final File file, final SAMSequenceDictionary sequenceDictionary) { + mFile = file; + this.sequenceDictionary = sequenceDictionary; + + // Open the file stream. + openIndexFile(); + + // Verify the magic number. + seek(0); + final byte[] buffer = readBytes(4); + if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) { + throw new ReviewedGATKException("Invalid file header in BAM index " + mFile + + ": " + new String(buffer)); + } + + seek(4); + + sequenceCount = readInteger(); + + // Create a cache of the starting position of each sequence. Initialize it to -1. + sequenceStartCache = new long[sequenceCount]; + for(int i = 1; i < sequenceCount; i++) + sequenceStartCache[i] = -1; + + // Seed the first element in the array with the current position. + if(sequenceCount > 0) + sequenceStartCache[0] = position(); + + closeIndexFile(); + } + + public GATKBAMIndexData readReferenceSequence(final int referenceSequence) { + openIndexFile(); + + if (referenceSequence >= sequenceCount) + throw new ReviewedGATKException("Invalid sequence number " + referenceSequence + " in index file " + mFile); + + skipToSequence(referenceSequence); + + final int binCount = readInteger(); + List bins = new ArrayList<>(); + for (int binNumber = 0; binNumber < binCount; binNumber++) { + final int indexBin = readInteger(); + final int nChunks = readInteger(); + + final List chunks = new ArrayList<>(nChunks); + final long[] rawChunkData = readLongs(nChunks*2); + for (int ci = 0; ci < nChunks; ci++) { + final long chunkBegin = rawChunkData[ci*2]; + final long chunkEnd = rawChunkData[ci*2+1]; + chunks.add(new GATKChunk(chunkBegin, chunkEnd)); + } + final GATKBin bin = new GATKBin(referenceSequence, indexBin); + bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()])); + while(indexBin >= bins.size()) + bins.add(null); + bins.set(indexBin,bin); + } + + final int nLinearBins = readInteger(); + long[] linearIndexEntries = readLongs(nLinearBins); + + final LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries); + + closeIndexFile(); + + return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex); + } + + /** + * Gets the number of bins in the given level. + * @param levelNumber Level number. 0-based. + * @return The size (number of possible bins) of the given level. + */ + @Override + public int getLevelSize(final int levelNumber) { + if(levelNumber == getNumIndexLevels()-1) + return MAX_BINS-LEVEL_STARTS[levelNumber]-1; + else + return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber]; + } + + /** + * Gets the level associated with the given bin number. + * @param bin The bin for which to determine the level. + * @return the level associated with the given bin number. + */ + @Override + public int getLevelForBin(final Bin bin) { + final GATKBin gatkBin = new GATKBin(bin); + if(gatkBin.getBinNumber() >= MAX_BINS) + throw new ReviewedGATKException("Tried to get level for invalid bin in index file " + mFile); + for(int i = getNumIndexLevels()-1; i >= 0; i--) { + if(gatkBin.getBinNumber() >= LEVEL_STARTS[i]) + return i; + } + throw new ReviewedGATKException("Unable to find correct bin for bin " + bin + " in index file " + mFile); + } + + /** + * Gets the first locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + @Override + public int getFirstLocusInBin(final Bin bin) { + final int level = getLevelForBin(bin); + final int levelStart = LEVEL_STARTS[level]; + final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; + return (new GATKBin(bin).getBinNumber() - levelStart)*(BIN_GENOMIC_SPAN /levelSize)+1; + } + + /** + * Gets the last locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + @Override + public int getLastLocusInBin(final Bin bin) { + final int level = getLevelForBin(bin); + final int levelStart = LEVEL_STARTS[level]; + final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; + return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize); + } + + /** + * Use to get close to the unmapped reads at the end of a BAM file. + * @return The file offset of the first record in the last linear bin, or -1 + * if there are no elements in linear bins (i.e. no mapped reads). + */ + @Override + public long getStartOfLastLinearBin() { + openIndexFile(); + + seek(4); + + final int sequenceCount = readInteger(); + // Because no reads may align to the last sequence in the sequence dictionary, + // grab the last element of the linear index for each sequence, and return + // the last one from the last sequence that has one. + long lastLinearIndexPointer = -1; + for (int i = 0; i < sequenceCount; i++) { + // System.out.println("# Sequence TID: " + i); + final int nBins = readInteger(); + // System.out.println("# nBins: " + nBins); + for (int j1 = 0; j1 < nBins; j1++) { + // Skip bin # + skipBytes(4); + final int nChunks = readInteger(); + // Skip chunks + skipBytes(16 * nChunks); + } + final int nLinearBins = readInteger(); + if (nLinearBins > 0) { + // Skip to last element of list of linear bins + skipBytes(8 * (nLinearBins - 1)); + lastLinearIndexPointer = readLongs(1)[0]; + } + } + + closeIndexFile(); + + return lastLinearIndexPointer; + } + + protected void skipToSequence(final int referenceSequence) { + // Find the offset in the file of the last sequence whose position has been determined. Start here + // when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always + // be present, so no extra stopping condition is necessary. + int sequenceIndex = referenceSequence; + while(sequenceStartCache[sequenceIndex] == -1) + sequenceIndex--; + + // Advance to the most recently found position. + seek(sequenceStartCache[sequenceIndex]); + + for (int i = sequenceIndex; i < referenceSequence; i++) { + sequenceStartCache[i] = position(); + // System.out.println("# Sequence TID: " + i); + final int nBins = readInteger(); + // System.out.println("# nBins: " + nBins); + for (int j = 0; j < nBins; j++) { + /* final int bin = */ + readInteger(); + final int nChunks = readInteger(); + // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); + skipBytes(16 * nChunks); + } + final int nLinearBins = readInteger(); + // System.out.println("# nLinearBins: " + nLinearBins); + skipBytes(8 * nLinearBins); + + } + + sequenceStartCache[referenceSequence] = position(); + } + + + + private void openIndexFile() { + try { + fileStream = new SeekableFileStream(mFile); + baiStream = SamIndexes.asBaiSeekableStreamOrNull(fileStream, sequenceDictionary); + bufferedStream = new SeekableBufferedStream(baiStream, BUFFERED_STREAM_BUFFER_SIZE); + fileLength=bufferedStream.length(); + } + catch (IOException exc) { + throw new ReviewedGATKException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc); + } + } + + private void closeIndexFile() { + try { + bufferedStream.close(); + baiStream.close(); + fileStream.close(); + fileLength = -1; + } + catch (IOException exc) { + throw new ReviewedGATKException("Unable to close index file " + mFile, exc); + } + } + + private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; + private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; + + private byte[] readBytes(int count) { + final ByteBuffer buffer = getBuffer(count); + read(buffer); + buffer.flip(); + byte[] contents = new byte[count]; + buffer.get(contents); + return contents; + } + + private int readInteger() { + final ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES); + read(buffer); + buffer.flip(); + return buffer.getInt(); + } + + /** + * Reads an array of longs from the file channel, returning the results as an array. + * @param count Number of longs to read. + * @return An array of longs. Size of array should match count. + */ + private long[] readLongs(final int count) { + final ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES); + read(buffer); + buffer.flip(); + long[] result = new long[count]; + for(int i = 0; i < count; i++) + result[i] = buffer.getLong(); + return result; + } + + private void read(final ByteBuffer buffer) { + final int bytesRequested = buffer.limit(); + if (bytesRequested == 0) + return; + + try { + + //BufferedInputStream cannot read directly into a byte buffer, so we read into an array + //and put the result into the bytebuffer after the if statement. + + // We have a rigid expectation here to read in exactly the number of bytes we've limited + // our buffer to -- if there isn't enough data in the file, the index + // must be truncated or otherwise corrupt: + if(bytesRequested > fileLength - bufferedStream.position()){ + throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + + "It's likely that this file is truncated or corrupt -- " + + "Please try re-indexing the corresponding BAM file.", + mFile)); + } + + int bytesRead = bufferedStream.read(byteArray, 0, bytesRequested); + + // We have a rigid expectation here to read in exactly the number of bytes we've limited + // our buffer to -- if we encounter EOF (-1), the index + // must be truncated or otherwise corrupt: + if (bytesRead <= 0) { + throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + + "It's likely that this file is truncated or corrupt -- " + + "Please try re-indexing the corresponding BAM file.", + mFile)); + } + + if(bytesRead != bytesRequested) + throw new RuntimeException("Read amount different from requested amount. This should not happen."); + + buffer.put(byteArray, 0, bytesRequested); + } + catch(IOException ex) { + throw new ReviewedGATKException("Index: unable to read bytes from index file " + mFile); + } + } + + + /** + * A reusable buffer for use by this index generator. + * TODO: Should this be a SoftReference? + */ + private ByteBuffer buffer = null; + + //BufferedStream don't read into ByteBuffers, so we need this temporary array + private byte[] byteArray=null; + private ByteBuffer getBuffer(final int size) { + if(buffer == null || buffer.capacity() < size) { + // Allocate a new byte buffer. For now, make it indirect to make sure it winds up on the heap for easier debugging. + buffer = ByteBuffer.allocate(size); + byteArray = new byte[size]; + buffer.order(ByteOrder.LITTLE_ENDIAN); + } + buffer.clear(); + buffer.limit(size); + return buffer; + } + + private void skipBytes(final int count) { + try { + + //try to skip forward the requested amount. + final long skipped = bufferedStream.skip(count); + + if( skipped != count ) { //if not managed to skip the requested amount + throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); + } + } + catch(IOException ex) { + throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); + } + } + + private void seek(final long position) { + try { + //to seek a new position, move the fileChannel, and reposition the bufferedStream + bufferedStream.seek(position); + } + catch(IOException ex) { + throw new ReviewedGATKException("Index: unable to reposition of file channel of index file " + mFile); + } + } + + /** + * Retrieve the position from the current file channel. + * @return position of the current file channel. + */ + private long position() { + try { + return bufferedStream.position(); + } + catch (IOException exc) { + throw new ReviewedGATKException("Unable to read position from index file " + mFile, exc); + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java index dd662f01d..76d309bcb 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java @@ -28,7 +28,10 @@ package org.broadinstitute.gatk.engine.datasources.reads; import htsjdk.samtools.MergingSamRecordIterator; import htsjdk.samtools.SamFileHeaderMerger; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFileFactory; +import htsjdk.samtools.sra.SRAAccession; +import htsjdk.samtools.sra.SRAIndexedSequenceFile; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.RuntimeIOException; @@ -51,7 +54,6 @@ import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecordIterator; import org.broadinstitute.gatk.utils.sam.SAMReaderID; @@ -310,7 +312,7 @@ public class SAMDataSource { // Determine the sort order. for(SAMReaderID readerID: readerIDs) { - if (! readerID.getSamFile().canRead() ) + if (!SRAAccession.isValid(readerID.getSamFile().getPath()) && !readerID.getSamFile().canRead() ) throw new UserException.CouldNotReadInputFile(readerID.getSamFile(),"file is not present or user does not have appropriate permissions. " + "Please check that the file is present and readable and try again."); @@ -377,15 +379,28 @@ public class SAMDataSource { if (referenceFile == null) { samSequenceDictionary = mergedHeader.getSequenceDictionary(); } else { - samSequenceDictionary = ReferenceSequenceFileFactory. - getReferenceSequenceFile(referenceFile). - getSequenceDictionary(); + ReferenceSequenceFile ref; + // maybe it is SRA file? + if (SRAAccession.isValid(referenceFile.getPath())) { + ref = new SRAIndexedSequenceFile(new SRAAccession(referenceFile.getPath())); + } else { + ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile); + } + samSequenceDictionary = ref.getSequenceDictionary(); } for(SAMReaderID id: readerIDs) { File indexFile = findIndexFile(id.getSamFile()); - if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile, samSequenceDictionary)); + if(indexFile != null) { + bamIndices.put(id, new GATKBAMIndexFromFile(indexFile, samSequenceDictionary)); + continue; + } + + // if index file is not found, try SamReader indexing interface + SamReader reader = readers.getReader(id); + if (reader.indexing().hasBrowseableIndex()) { + bamIndices.put(id, new GATKBAMIndexFromDataSource(id.getSamFile(), reader.getFileHeader(), reader.indexing().getBrowseableIndex())); + } } resourcePool.releaseReaders(readers); @@ -1122,12 +1137,17 @@ public class SAMDataSource { try { if (threadAllocation.getNumIOThreads() > 0) blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = SamReaderFactory.makeDefault() - .referenceSequence(referenceFile) + SamReaderFactory factory = SamReaderFactory.makeDefault() .validationStringency(validationStringency) - .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false); + + if (SRAAccession.isValid(readerID.getSamFile().getPath())) { + reader = factory.open(SamInputResource.of(new SRAAccession(readerID.getSamFile().getPath()))); + } else { + reader = factory.referenceSequence(referenceFile) .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, true) .open(readerID.getSamFile()); + } } catch ( RuntimeIOException e ) { throw new UserException.CouldNotReadInputFile(readerID.getSamFile(), e); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java index 7dace7bc5..377a826e3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.engine.datasources.reads.utilities; import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; import org.broadinstitute.gatk.utils.commandline.Input; @@ -88,12 +89,12 @@ public class FindLargeShards extends CommandLineProgram { @Override public int execute() throws IOException { // initialize reference - IndexedFastaSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile); - GenomeLocParser genomeLocParser = new GenomeLocParser(refReader); + final ReferenceSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile); + final GenomeLocParser genomeLocParser = new GenomeLocParser(refReader); // initialize reads List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); - SAMDataSource dataSource = new SAMDataSource(referenceFile, bamReaders, new ThreadAllocation(), null, genomeLocParser); + final SAMDataSource dataSource = new SAMDataSource(referenceFile, bamReaders, new ThreadAllocation(), null, genomeLocParser); // intervals final GenomeLocSortedSet intervalSortedSet; @@ -124,12 +125,12 @@ public class FindLargeShards extends CommandLineProgram { } // Print out the stddev: (sum(x^2) - (1/N)*sum(x)^2)/N - long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue(); - long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue())); + final long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue(); + final long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue())); logger.info(String.format("Number of shards: %d; mean uncompressed size = %d; stddev uncompressed size = %d%n",numberOfShards,mean,stddev)); // Crank through the shards again, this time reporting on the shards significantly larger than the mean. - long threshold = mean + stddev*5; + final long threshold = mean + stddev*5; logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java index 5dc159e27..a6189d45c 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.engine.datasources.reference; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; import org.broadinstitute.gatk.engine.datasources.reads.Shard; @@ -46,7 +46,7 @@ import java.util.List; * Looks for fai and dict files, and tries to create them if they don't exist */ public class ReferenceDataSource { - private IndexedFastaSequenceFile reference; + private ReferenceSequenceFile reference; /** our log, which we want to capture anything from this class */ protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); @@ -55,7 +55,7 @@ public class ReferenceDataSource { * Create reference data source from fasta file * @param fastaFile Fasta file to be used as reference */ - public ReferenceDataSource(File fastaFile) { + public ReferenceDataSource(final File fastaFile) { reference = CachingIndexedFastaSequenceFile.checkAndCreate(fastaFile); } @@ -63,7 +63,7 @@ public class ReferenceDataSource { * Get indexed fasta file * @return IndexedFastaSequenceFile that was created from file */ - public IndexedFastaSequenceFile getReference() { + public ReferenceSequenceFile getReference() { return this.reference; } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java index 856c931b1..a0daa1cf6 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.executive; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.TribbleException; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; @@ -111,7 +111,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, - final IndexedFastaSequenceFile reference, + final ReferenceSequenceFile reference, final Collection rods, final ThreadAllocation threadAllocation) { super(engine, walker, reads, reference, rods, threadAllocation); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java index 22ca49300..f35acf77c 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.executive; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; @@ -63,7 +63,7 @@ public class LinearMicroScheduler extends MicroScheduler { protected LinearMicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, - final IndexedFastaSequenceFile reference, + final ReferenceSequenceFile reference, final Collection rods, final ThreadAllocation threadAllocation) { super(engine, walker, reads, reference, rods, threadAllocation); @@ -84,7 +84,6 @@ public class LinearMicroScheduler extends MicroScheduler { Accumulator accumulator = Accumulator.create(engine,walker); boolean done = walker.isDone(); - int counter = 0; final TraversalEngine traversalEngine = borrowTraversalEngine(this); for (Shard shard : shardStrategy ) { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java index 5da39245e..f3f28807c 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java @@ -26,7 +26,7 @@ package org.broadinstitute.gatk.engine.executive; import com.google.java.contract.Ensures; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.ReadMetrics; @@ -104,7 +104,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ protected final GenomeAnalysisEngine engine; - protected final IndexedFastaSequenceFile reference; + protected final ReferenceSequenceFile reference; private final SAMDataSource reads; protected final Collection rods; @@ -131,7 +131,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * * @return The best-fit microscheduler. */ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + public static MicroScheduler create(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, final ReferenceSequenceFile reference, + final Collection rods, final ThreadAllocation threadAllocation) { if ( threadAllocation.isRunningInParallelMode() ) { logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", @@ -183,7 +184,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { protected MicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, - final IndexedFastaSequenceFile reference, + final ReferenceSequenceFile reference, final Collection rods, final ThreadAllocation threadAllocation) { this.engine = engine; @@ -397,7 +398,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * Returns the reference maintained by this scheduler. * @return The reference maintained by this scheduler. */ - public IndexedFastaSequenceFile getReference() { return reference; } + public ReferenceSequenceFile getReference() { return reference; } protected void cleanup() { try { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java index af3432f8b..bbe17ba6e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.filters; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.WalkerManager; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; @@ -41,7 +41,7 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; */ public class BAQReadTransformer extends ReadTransformer { private BAQ baqHMM; - private IndexedFastaSequenceFile refReader; + private ReferenceSequenceFile refReader; private BAQ.CalculationMode cmode; private BAQ.QualityMode qmode; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java index bb3145d6f..669fa4bb3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java @@ -27,6 +27,7 @@ package org.broadinstitute.gatk.engine.walkers; import com.google.java.contract.Ensures; import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; @@ -181,7 +182,7 @@ public abstract class ActiveRegionWalker extends Walker allIntervals = new ArrayList(); for( final GenomeLoc interval : intervals.toList() ) { diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java index a5a77ca73..fc6f2b526 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.engine; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.ValidationExclusion; @@ -82,7 +82,7 @@ public class ReadMetricsUnitTest extends BaseTest { // Test the accuracy of the read metrics private File referenceFile; - private IndexedFastaSequenceFile reference; + private ReferenceSequenceFile reference; private SAMSequenceDictionary dictionary; private SAMFileHeader header; private GATKSAMReadGroupRecord readGroup; diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java index 3ca67276a..856167fea 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.utils.commandline.Tags; @@ -71,7 +72,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { /** * Sequence file. */ - private static IndexedFastaSequenceFile seq; + private static ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; /** diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java index c113b3392..82d03edb5 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java @@ -26,7 +26,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -57,7 +57,7 @@ public abstract class ReferenceViewTemplate extends BaseTest { /** * The fasta, for comparison. */ - protected IndexedFastaSequenceFile sequenceFile = null; + protected ReferenceSequenceFile sequenceFile = null; protected GenomeLocParser genomeLocParser = null; // diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java index fd59eb2d3..beb50c6eb 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.GATKBAMFileSpan; import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -45,7 +45,7 @@ import java.io.FileNotFoundException; * */ public class FilePointerUnitTest extends BaseTest { - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; private SAMReaderID readerID = new SAMReaderID("samFile",new Tags()); diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java new file mode 100644 index 000000000..3cd4ac6e4 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java @@ -0,0 +1,98 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; + +/** + * Test basic functionality in the GATK's wrapper around htsjdk.samtools.BrowseableBAMIndex. + */ +public class GATKBAMIndexFromDataSourceUnitTest extends BaseTest { + private static File bamFile = new File(validationDataLocation+"MV1994.selected.bam"); + + /** + * Storage for the index itself. + */ + private GATKBAMIndex bamIndex; + + @BeforeClass + public void init() throws IOException { + final SAMFileReader reader = new SAMFileReader(bamFile); + reader.enableIndexCaching(true); // needed ot get BrowseableBAMIndex + Assert.assertTrue(reader.hasIndex()); + Assert.assertTrue(reader.indexing().hasBrowseableIndex()); + + bamIndex = new GATKBAMIndexFromDataSource(bamFile, reader.getFileHeader(), reader.indexing().getBrowseableIndex()); + reader.close(); + } + + @Test + public void testNumberAndSizeOfIndexLevels() { + // The correct values for this test are pulled directly from the + // SAM Format Specification v1.3-r882, Section 4.1.1, last paragraph. + Assert.assertEquals(GATKBAMIndex.getNumIndexLevels(),6,"Incorrect number of levels in BAM index"); + + // Level 0 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(0),0); + Assert.assertEquals(bamIndex.getLevelSize(0),1); + + // Level 1 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(1),1); + Assert.assertEquals(bamIndex.getLevelSize(1),8-1+1); + + // Level 2 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(2),9); + Assert.assertEquals(bamIndex.getLevelSize(2),72-9+1); + + // Level 3 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(3),73); + Assert.assertEquals(bamIndex.getLevelSize(3),584-73+1); + + // Level 4 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(4),585); + Assert.assertEquals(bamIndex.getLevelSize(4),4680-585+1); + + // Level 5 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(5),4681); + // Need to wait unitl AbstractBAMFileIndex.getLevelSize() is fixed, will throw an ArrayIndexOutOfBoundsException if 1 before the end of the GenomicIndexUtil.LEVEL_STARTS array + //Assert.assertEquals(bamIndex.getLevelSize(5),37448-4681+1); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testTooManyReferenceSequencesDataSource() { + final GATKBAMIndexData index = bamIndex.readReferenceSequence(1); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java similarity index 85% rename from public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java index f6be513bf..c13a0006d 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java @@ -39,7 +39,7 @@ import java.io.FileNotFoundException; /** * Test basic functionality in the GATK's implementation of the BAM index classes. */ -public class GATKBAMIndexUnitTest extends BaseTest { +public class GATKBAMIndexFromFileUnitTest extends BaseTest { private static File bamFile = new File(validationDataLocation+"MV1994.selected.bam"); /** @@ -60,11 +60,11 @@ public class GATKBAMIndexUnitTest extends BaseTest { @BeforeClass public void init() throws FileNotFoundException { - SAMFileReader reader = new SAMFileReader(bamFile); - this.sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); + final SAMFileReader reader = new SAMFileReader(bamFile); + sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); reader.close(); - - bamIndex = new GATKBAMIndex(bamIndexFile, sequenceDictionary); + + bamIndex = new GATKBAMIndexFromFile(bamIndexFile, sequenceDictionary); } @Test @@ -100,13 +100,13 @@ public class GATKBAMIndexUnitTest extends BaseTest { @Test( expectedExceptions = UserException.MalformedFile.class ) public void testDetectTruncatedBamIndexWordBoundary() { - GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_word_boundary.bai"), sequenceDictionary); + final GATKBAMIndex index = new GATKBAMIndexFromFile(new File(privateTestDir + "truncated_at_word_boundary.bai"), sequenceDictionary); index.readReferenceSequence(0); } @Test( expectedExceptions = UserException.MalformedFile.class ) - public void testDetectTruncatedBamIndexNonWordBoundary() { - GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_non_word_boundary.bai"), sequenceDictionary); + public void testDetectTruncatedBamIndeFromFilexNonWordBoundary() { + final GATKBAMIndex index = new GATKBAMIndexFromFile(new File(privateTestDir + "truncated_at_non_word_boundary.bai"), sequenceDictionary); index.readReferenceSequence(0); } diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java index 606c3aa72..96731336a 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.ValidationExclusion; @@ -64,7 +64,7 @@ public class SAMDataSourceUnitTest extends BaseTest { private List readers; private File referenceFile; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; /** diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java index 1ab027d06..a63ce6ee1 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.rmd; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; import org.testng.Assert; @@ -68,7 +69,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { private RMDTriplet triplet = null; private RMDTrackBuilder builder = null; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; private GenomeLoc testSite1; diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java index 99cd61cbe..40357b442 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.rmd; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.Tags; @@ -46,7 +46,7 @@ public class ReferenceOrderedQueryDataPoolUnitTest extends BaseTest{ // Build up query parameters File file = new File(BaseTest.privateTestDir + "NA12878.hg19.example1.vcf"); RMDTriplet triplet = new RMDTriplet("test", "VCF", file.getAbsolutePath(), RMDTriplet.RMDStorageType.FILE, new Tags()); - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); GenomeLocParser parser = new GenomeLocParser(seq); GenomeLoc loc = parser.createGenomeLoc("20", 1, 100000); TestRMDTrackBuilder builder = new TestRMDTrackBuilder(seq.getSequenceDictionary(), parser); diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java index 6277646e6..bf625ad11 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.traversals; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialBAMBuilder; @@ -43,7 +43,7 @@ import java.util.List; public class TAROrderedReadCacheUnitTest extends BaseTest { // example fasta index file, can be deleted if you don't use the reference - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java index d0b8c7170..f931ac40b 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java @@ -27,6 +27,7 @@ package org.broadinstitute.gatk.engine.traversals; import com.google.java.contract.PreconditionError; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.ValidationExclusion; import org.broadinstitute.gatk.engine.datasources.reads.*; @@ -39,7 +40,6 @@ import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; import org.broadinstitute.gatk.utils.interval.IntervalUtils; import org.broadinstitute.gatk.utils.sam.*; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; @@ -80,7 +80,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private File referenceFile; - private IndexedFastaSequenceFile reference; + private ReferenceSequenceFile reference; private SAMSequenceDictionary dictionary; private GenomeLocParser genomeLocParser; diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java index 45bc8da46..d7a0e6b2e 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java @@ -25,7 +25,6 @@ package org.broadinstitute.gatk.engine.traversals; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.walkers.TestCountReadsWalker; import org.broadinstitute.gatk.utils.BaseTest; @@ -90,7 +89,7 @@ public class TraverseReadsUnitTest extends BaseTest { private File output; private TraverseReadsNano traversalEngine = null; - private IndexedFastaSequenceFile ref = null; + private ReferenceSequenceFile ref = null; private GenomeLocParser genomeLocParser = null; private GenomeAnalysisEngine engine = null; diff --git a/public/gatk-root/pom.xml b/public/gatk-root/pom.xml index d4daad80e..aa12e0180 100644 --- a/public/gatk-root/pom.xml +++ b/public/gatk-root/pom.xml @@ -418,6 +418,7 @@ ${gatk.queuetests.run} ${java.io.tmpdir} + sra diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java index 49983241d..5c855775f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.qc; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; @@ -70,7 +70,7 @@ public class QCRef extends RefWalker { String contigName = ""; int contigStart, contigEnd; - IndexedFastaSequenceFile uncachedRef; + ReferenceSequenceFile uncachedRef; byte[] uncachedBases; @Override diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java index 1e97c388a..ef6ebd7b6 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java @@ -27,6 +27,7 @@ package org.broadinstitute.gatk.tools.walkers.varianteval; import com.google.java.contract.Requires; import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.util.IntervalTree; import htsjdk.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; @@ -282,7 +283,7 @@ public class VariantEval extends RodWalker implements TreeRedu private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this); // Ancestral alignments - private IndexedFastaSequenceFile ancestralAlignments = null; + private ReferenceSequenceFile ancestralAlignments = null; // The set of all possible evaluation contexts StratificationManager stratManager; diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java index 1e9a1dc81..3ec696592 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.tools.walkers.filters; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -51,7 +51,7 @@ public class VariantFiltrationUnitTest extends BaseTest { @BeforeTest public void before() { // Create GenomeLoc - IndexedFastaSequenceFile fasta = CachingIndexedFastaSequenceFile.checkAndCreate(new File(privateTestDir + "iupacFASTA.fasta")); + ReferenceSequenceFile fasta = CachingIndexedFastaSequenceFile.checkAndCreate(new File(privateTestDir + "iupacFASTA.fasta")); GenomeLocParser genomeLocParser = new GenomeLocParser(fasta); chr1 = fasta.getSequenceDictionary().getSequence(0).getSequenceName(); genomeLoc = genomeLocParser.createGenomeLoc(chr1, 5, 10); diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerUnitTest.java index 801db5269..de62d5525 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerUnitTest.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.indels; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; @@ -44,7 +44,7 @@ public class IndelRealignerUnitTest extends BaseTest { @BeforeClass public void setup() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); } diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/SRAPrintReadsIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/SRAPrintReadsIntegrationTest.java new file mode 100644 index 000000000..6ebb0074b --- /dev/null +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/SRAPrintReadsIntegrationTest.java @@ -0,0 +1,143 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.readutils; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.SkipException; +import org.testng.annotations.BeforeGroups; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Collections; + +import htsjdk.samtools.sra.SRAAccession; + +import java.lang.reflect.Method; + +public class SRAPrintReadsIntegrationTest extends WalkerTest { + + private static class PRTest { + final String accession; + final String args; + final String md5; + + private PRTest(String accession, String args, String md5) { + this.accession = accession; + this.args = args; + this.md5 = md5; + } + + @Override + public String toString() { + return String.format("PRTest(accession='%s', args='%s')", accession, args); + } + } + + private static boolean canResolveNetworkAccession = false; + private static String checkAccession = "SRR000123"; + private static final String REMOTE_ACCESSION_PATTERN = "^[SED]RR[0-9]{6,9}$"; + + /** + * Are the SRA native libraries loaded and initialized? Does the test accession have a valid name? + */ + @BeforeGroups(groups = {"sra"}) + public final void checkIfCanResolve() { + // Did SRA successfully load the native libraries and are fully initialized? + if (!SRAAccession.isSupported()) { + return; + } + + // Is this is a valid SRA accession? + canResolveNetworkAccession = SRAAccession.isValid(checkAccession); + } + + /** + * Are the SRA native libraries loaded and initialized? + * + * @throws SkipException if the SRA native libraries are loaded and initialized + */ + @BeforeMethod + public final void assertSRAIsSupported(){ + if(!SRAAccession.isSupported()){ + throw new SkipException("Skipping SRA Test because SRA native code is unavailable."); + } + } + + /** + * Skip network SRA Test because cannot resolve remote SRA accession + * + * @param method Provides information about, and access to, a single method on a class or interface + * @param params Method parameters + * @throws SkipException if cannot resold an SRA accession + */ + @BeforeMethod + public void skipIfCantResolve(Method method, Object[] params) { + String accession = null; + + if (params.length > 0) { + Object firstParam = params[0]; + if (firstParam instanceof PRTest) { + accession = ((PRTest)firstParam).accession; + } + } + + if (accession != null && + accession.matches(REMOTE_ACCESSION_PATTERN) && !canResolveNetworkAccession) { + throw new SkipException("Skipping network SRA Test because cannot resolve remote SRA accession '" + + checkAccession + "'."); + } + } + + @DataProvider(name = "PRTest") + public Object[][] createPrintReadsTestData() { + return new Object[][]{ + // Test with local SRA accessions + {new PRTest(privateTestDir + "ERR1214757.sra", "", "173ed87acc794a704aa000c6ab5d63a8")}, + {new PRTest(privateTestDir + "ERR1214757.sra", " -L NC_000001.10:1-50000", "6bc055f028c49bcbca990857e57a6e4b")}, + {new PRTest(privateTestDir + "ERR1214757.sra", " -L NC_000001.10:500000-1000000", "ab545064b2314971cfae7486ff74d779")}, + // Tests with remote SRA accessions + {new PRTest("ERR1214757", "", "173ed87acc794a704aa000c6ab5d63a8")}, + {new PRTest("ERR1214757", " -L NC_000001.10:1-50000", "6bc055f028c49bcbca990857e57a6e4b")}, + {new PRTest("ERR1214757", " -L NC_000001.10:500000-1000000", "ab545064b2314971cfae7486ff74d779")}, + }; + } + + @Test(groups = "sra", dataProvider = "PRTest") + public void testPrintReads(PRTest params) { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T PrintReads" + + " -R " + params.accession + + " -I " + params.accession + + params.args + + " --no_pg_tag" + + " -o %s", + Collections.singletonList(params.md5)); + executeTest("testPrintReads-"+params.args, spec).getFirst(); + } + +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java index 1fc67d482..e83628e7c 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java @@ -27,7 +27,7 @@ package org.broadinstitute.gatk.utils.activeregion; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocSortedSet; @@ -168,7 +168,7 @@ public class ActiveRegion implements HasGenomeLocation { /** * See #getActiveRegionReference but with padding == 0 */ - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { + public byte[] getActiveRegionReference( final ReferenceSequenceFile referenceReader ) { return getActiveRegionReference(referenceReader, 0); } @@ -182,21 +182,21 @@ public class ActiveRegion implements HasGenomeLocation { * @return a non-null array of bytes holding the reference bases in referenceReader */ @Ensures("result != null") - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + public byte[] getActiveRegionReference( final ReferenceSequenceFile referenceReader, final int padding ) { return getReference(referenceReader, padding, extendedLoc); } /** * See #getActiveRegionReference but using the span including regions not the extended span */ - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { + public byte[] getFullReference( final ReferenceSequenceFile referenceReader ) { return getFullReference(referenceReader, 0); } /** * See #getActiveRegionReference but using the span including regions not the extended span */ - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + public byte[] getFullReference( final ReferenceSequenceFile referenceReader, final int padding ) { return getReference(referenceReader, padding, spanIncludingReads); } @@ -211,7 +211,7 @@ public class ActiveRegion implements HasGenomeLocation { * @return a non-null array of bytes holding the reference bases in referenceReader */ @Ensures("result != null") - public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + public byte[] getReference( final ReferenceSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java index b56305d86..e6facd081 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java @@ -25,12 +25,12 @@ package org.broadinstitute.gatk.utils.baq; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMUtils; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; @@ -151,7 +151,7 @@ public class BAQ { * @param b band width * @param minBaseQual All bases with Q < minBaseQual are up'd to this value */ - public BAQ(final double d, final double e, final int b, final byte minBaseQual, boolean includeClippedBases) { + public BAQ(final double d, final double e, final int b, final byte minBaseQual, final boolean includeClippedBases) { cd = d; ce = e; cb = b; this.minBaseQual = minBaseQual; this.includeClippedBases = includeClippedBases; @@ -521,7 +521,7 @@ public class BAQ { } } - public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) { + public BAQCalculationResult calcBAQFromHMM(final SAMRecord read, final ReferenceSequenceFile refReader) { // start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar int offset = getBandWidth() / 2; long readStart = includeClippedBases ? read.getUnclippedStart() : read.getAlignmentStart(); @@ -540,7 +540,7 @@ public class BAQ { // final SimpleTimer total = new SimpleTimer(); // final SimpleTimer local = new SimpleTimer(); // int n = 0; - public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals, int queryStart, int queryEnd ) { + public BAQCalculationResult calcBAQFromHMM(final byte[] ref, final byte[] query, final byte[] quals, final int queryStart, final int queryEnd ) { // total.restart(); if ( queryStart < 0 ) throw new ReviewedGATKException("BUG: queryStart < 0: " + queryStart); if ( queryEnd < 0 ) throw new ReviewedGATKException("BUG: queryEnd < 0: " + queryEnd); @@ -564,7 +564,7 @@ public class BAQ { * @param read * @return */ - private final Pair calculateQueryRange(SAMRecord read) { + private final Pair calculateQueryRange(final SAMRecord read) { int queryStart = -1, queryStop = -1; int readI = 0; @@ -599,7 +599,7 @@ public class BAQ { } // we need to pad ref by at least the bandwidth / 2 on either side - public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) { + public BAQCalculationResult calcBAQFromHMM(final SAMRecord read, final byte[] ref, final int refOffset) { // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read Pair queryRange = calculateQueryRange(read); if ( queryRange == null ) return null; // read has Ns, or is completely clipped away @@ -642,7 +642,7 @@ public class BAQ { return baqResult; } - public byte capBaseByBAQ( byte oq, byte bq, int state, int expectedPos ) { + public byte capBaseByBAQ( final byte oq, final byte bq, final int state, final int expectedPos ) { byte b; boolean isIndel = stateIsIndel(state); int pos = stateAlignedPosition(state); @@ -664,7 +664,7 @@ public class BAQ { * @param calculationType * @return BQ qualities for use, in case qmode is DONT_MODIFY */ - public byte[] baqRead(SAMRecord read, IndexedFastaSequenceFile refReader, CalculationMode calculationType, QualityMode qmode ) { + public byte[] baqRead(final SAMRecord read, final ReferenceSequenceFile refReader, final CalculationMode calculationType, final QualityMode qmode ) { if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName()); byte[] BAQQuals = read.getBaseQualities(); // in general we are overwriting quals, so just get a pointer to them @@ -706,7 +706,7 @@ public class BAQ { * @param read * @return */ - public boolean excludeReadFromBAQ(SAMRecord read) { + public boolean excludeReadFromBAQ(final SAMRecord read) { // keeping mapped reads, regardless of pairing status, or primary alignment status. return read.getReadUnmappedFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getDuplicateReadFlag(); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java index a193eebf4..4a09f8393 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -25,6 +25,9 @@ package org.broadinstitute.gatk.utils.fasta; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.sra.SRAAccession; +import htsjdk.samtools.sra.SRAIndexedSequenceFile; import org.broadinstitute.gatk.utils.exceptions.UserException; import htsjdk.samtools.SAMException; import htsjdk.samtools.reference.FastaSequenceIndex; @@ -192,8 +195,17 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * Possibly may be better as an overloaded constructor. * @param fastaFile Fasta file to be used as reference * @return A new instance of a CachingIndexedFastaSequenceFile. + * @throws IllegalArgumentException if Fasta file is null */ - public static CachingIndexedFastaSequenceFile checkAndCreate(final File fastaFile) { + public static ReferenceSequenceFile checkAndCreate(final File fastaFile) { + if ( fastaFile == null ) { + throw new IllegalArgumentException("Fasta file is null"); + } + // maybe it is SRA file? + if (SRAAccession.isValid(fastaFile.getPath())) { + return new SRAIndexedSequenceFile(new SRAAccession(fastaFile.getPath())); + } + // does the fasta file exist? check that first... if (!fastaFile.exists()) throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist."); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java index 55a4efe64..5bb518a50 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java @@ -25,10 +25,10 @@ package org.broadinstitute.gatk.utils.locusiterator; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; @@ -62,7 +62,7 @@ public class LIBSPerformance extends CommandLineProgram { @Override public int execute() throws IOException { - final IndexedFastaSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile); + final ReferenceSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile); final GenomeLocParser genomeLocParser = new GenomeLocParser(reference); final SAMFileReader reader = new SAMFileReader(samFile); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java index b69794ed9..1b63e6132 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.utils.sam; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.NGSPlatform; @@ -52,7 +52,7 @@ import java.util.*; public class ArtificialBAMBuilder { public final static int BAM_SHARD_SIZE = 16384; - private final IndexedFastaSequenceFile reference; + private final ReferenceSequenceFile reference; private final GenomeLocParser parser; final int nReadsPerLocus; @@ -73,7 +73,7 @@ public class ArtificialBAMBuilder { SAMFileHeader header; - public ArtificialBAMBuilder(final IndexedFastaSequenceFile reference, int nReadsPerLocus, int nLoci) { + public ArtificialBAMBuilder(final ReferenceSequenceFile reference, int nReadsPerLocus, int nLoci) { this.nReadsPerLocus = nReadsPerLocus; this.nLoci = nLoci; @@ -94,7 +94,7 @@ public class ArtificialBAMBuilder { createAndSetHeader(1); } - public IndexedFastaSequenceFile getReference() { + public ReferenceSequenceFile getReference() { return reference; } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java index 75a56b110..d62c1681b 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.utils.text; +import htsjdk.samtools.sra.SRAAccession; import org.broadinstitute.gatk.utils.commandline.ParsingEngine; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.utils.commandline.Tags; @@ -59,6 +60,13 @@ public class ListFileUtils { * @return a flattened list of the bam files provided */ public static List unpackBAMFileList(final List samFiles, final ParsingEngine parser) { + if ( samFiles == null ) { + throw new IllegalArgumentException("The SAM files are null"); + } + if ( parser == null ) { + throw new IllegalArgumentException("The parser is null"); + } + List unpackedReads = new ArrayList(); for( String inputFileName: samFiles ) { Tags inputFileNameTags = parser.getTags(inputFileName); @@ -79,10 +87,14 @@ public class ListFileUtils { else if(inputFileName.endsWith("stdin")) { unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags)); } + else if(SRAAccession.isValid(inputFileName)) { + unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags)); + } else { - throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM/CRAM files with the .bam/.cram extension and lists of BAM/CRAM files " + - "with the .list extension, but the file %s has neither extension. Please ensure that your BAM/CRAM file or list " + - "of BAM/CRAM files is in the correct format, update the extension, and try again.",inputFileName)); + throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM/CRAM files with the .bam/.cram extension " + "" + + "or SRA archives and lists of BAM/CRAM/SRA files with the .list extension, but the file %s has " + + "neither extension and is not SRA accession. Please ensure that your BAM/CRAM file or list " + + "of BAM/CRAM files is in the correct format, update the extension, and try again.", inputFileName)); } } return unpackedReads; diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java index 186f39f84..637539395 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java @@ -29,10 +29,10 @@ package org.broadinstitute.gatk.utils; // the imports for unit testing. -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; @@ -57,7 +57,7 @@ public class ExampleToCopyUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; // example fasta index file, can be deleted if you don't use the reference - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java index 43af1502c..67badf071 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java @@ -27,7 +27,7 @@ package org.broadinstitute.gatk.utils; import com.google.caliper.Param; import com.google.caliper.SimpleBenchmark; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; @@ -36,7 +36,7 @@ import java.io.File; * Caliper microbenchmark of genome loc parser */ public class GenomeLocParserBenchmark extends SimpleBenchmark { - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private final int ITERATIONS = 1000000; @Param({"NEW", "NONE"}) diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java index 1f8b1f49c..a06c86ba6 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java @@ -28,10 +28,9 @@ package org.broadinstitute.gatk.utils; // the imports for unit testing. - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; @@ -53,7 +52,7 @@ public class NGSPlatformUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; // example fasta index file, can be deleted if you don't use the reference - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java index 4a0b38215..1728f2170 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java @@ -29,8 +29,8 @@ package org.broadinstitute.gatk.utils.activeregion; // the imports for unit testing. -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -51,7 +51,7 @@ import java.util.*; public class ActiveRegionUnitTest extends BaseTest { private final static boolean DEBUG = false; private GenomeLocParser genomeLocParser; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private String contig; private int contigLength; diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java index 0f065eef6..433637927 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.utils.baq; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.Test; @@ -52,7 +53,7 @@ public class BAQUnitTest extends BaseTest { private final int startChr = 1; private final int numChr = 2; private final int chrSize = 1000; - IndexedFastaSequenceFile fasta = null; + ReferenceSequenceFile fasta = null; @BeforeMethod public void before() { diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index d55c3ad59..a8d6fd0db 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -32,6 +32,7 @@ package org.broadinstitute.gatk.utils.fasta; import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Priority; import org.broadinstitute.gatk.utils.BaseTest; @@ -93,7 +94,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException { Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers"); - final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + final ReferenceSequenceFile uncached = new IndexedFastaSequenceFile(fasta); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) { @@ -123,7 +124,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { // Tests grabbing sequences around a middle cached value. @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { - final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + final ReferenceSequenceFile uncached = new IndexedFastaSequenceFile(fasta); final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -191,7 +192,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { // make sure some bases are lower case and some are upper case @Test(enabled = true) public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException { - final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA)); + final ReferenceSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA)); final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(new File(exampleFASTA), true); final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(new File(exampleFASTA)); @@ -208,9 +209,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file. Unexpected test state"); } - private int testCases(final IndexedFastaSequenceFile original, - final IndexedFastaSequenceFile casePreserving, - final IndexedFastaSequenceFile allUpper, + private int testCases(final ReferenceSequenceFile original, + final ReferenceSequenceFile casePreserving, + final ReferenceSequenceFile allUpper, final String contig, final int start, final int stop ) { final String orig = fetchBaseString(original, contig, start, stop); final String keptCase = fetchBaseString(casePreserving, contig, start, stop); @@ -226,7 +227,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } } - private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start, final int stop) { + private String fetchBaseString(final ReferenceSequenceFile reader, final String contig, final int start, final int stop) { if ( start == -1 ) return new String(reader.getSequence(contig).getBases()); else diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java index de14e0f43..18c18a917 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java @@ -25,8 +25,7 @@ package org.broadinstitute.gatk.utils.refdata.tracks; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import htsjdk.tribble.FeatureCodec; import org.broadinstitute.gatk.utils.BaseTest; @@ -66,7 +65,7 @@ public class FeatureManagerUnitTest extends BaseTest { public void setup() { File referenceFile = new File(b36KGReference); try { - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); genomeLocParser = new GenomeLocParser(seq); manager = new FeatureManager(); } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java index 3131f92d6..e904f58ed 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -26,7 +26,7 @@ package org.broadinstitute.gatk.utils.refdata.tracks; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Tribble; import htsjdk.tribble.index.Index; import htsjdk.tribble.util.LittleEndianOutputStream; @@ -54,7 +54,7 @@ import java.nio.channels.FileChannel; */ public class RMDTrackBuilderUnitTest extends BaseTest { private RMDTrackBuilder builder; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; @BeforeMethod diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java index da12cf506..17a66c617 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.utils.refdata.utils; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -43,7 +43,7 @@ public class FeatureToGATKFeatureIteratorUnitTest extends BaseTest { @SuppressWarnings("unchecked") public void testCloseFilePointers() throws IOException { final String chr = "20"; - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); GenomeLocParser parser = new GenomeLocParser(seq); File file = new File(privateTestDir + "NA12878.hg19.example1.vcf"); VCFCodec codec = new VCFCodec(); diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java index 372a711b0..9a9a540a4 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java @@ -26,8 +26,8 @@ package org.broadinstitute.gatk.utils.sam; import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.Utils; @@ -223,7 +223,7 @@ public class ReadUtilsUnitTest extends BaseTest { @Test (enabled = true) public void testReadWithNsRefIndexInDeletion() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); final int readLength = 76; @@ -239,7 +239,7 @@ public class ReadUtilsUnitTest extends BaseTest { @Test (enabled = true) public void testReadWithNsRefAfterDeletion() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); final int readLength = 76;