From 6d855041ec533f3385874637b1bb56cc72f9218d Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 12 Jan 2011 21:54:51 +0000 Subject: [PATCH] Oops...forgot to commit the changes that allow primitive VCF streaming. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4979 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/CommandLineExecutable.java | 9 +++++- .../arguments/GATKArgumentCollection.java | 10 ++++++ .../ReferenceOrderedDataSource.java | 13 ++++---- .../gatk/refdata/SeekableRODIterator.java | 32 +++++++++++++++++-- .../gatk/refdata/utils/FlashBackIterator.java | 20 ++++++++++++ .../LocationAwareSeekableRODIterator.java | 5 +++ .../coverage/DepthOfCoverageWalker.java | 2 +- .../indels/IndelGenotyperV2Walker.java | 4 ++- .../walkers/sequenom/PickSequenomProbes.java | 11 +++++-- .../walkers/IndelAnnotator.java | 7 ++-- ...ReadBasedReferenceOrderedViewUnitTest.java | 20 ++++++++++++ .../utils/FlashBackIteratorUnitTest.java | 20 ++++++++++++ 12 files changed, 136 insertions(+), 17 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index 20692c68a..8cc0fefa8 100644 --- a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -243,7 +243,14 @@ public abstract class CommandLineExecutable extends CommandLineProgram { for (String fileName: argCollection.RODBindings) { List parameters = parser.getTags(fileName); fileName = expandFileName(fileName); - RMDStorageType storageType = fileName.toLowerCase().endsWith("stdin") ? RMDStorageType.STREAM : RMDStorageType.FILE; + + RMDStorageType storageType = null; + if(argCollection.rodInputType != null) + storageType = argCollection.rodInputType; + else if(fileName.toLowerCase().endsWith("stdin")) + storageType = RMDStorageType.STREAM; + else + storageType = RMDStorageType.FILE; if(parameters.size() != 2) throw new UserException("Invalid syntax for -B (reference-ordered data) input flag. " + diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 7d4716276..0365765d1 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk.arguments; import net.sf.samtools.SAMFileReader; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.commandline.Argument; @@ -210,6 +212,11 @@ public class GATKArgumentCollection { @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; + @Element(required=false) + @Argument(fullName="rod_input_type",shortName="rit",doc="Indicates whether to use a file approach or a streaming approach to loading ROD data",required=false) + @Hidden + public RMDTriplet.RMDStorageType rodInputType = null; + /** * marshal the data out to a object * @@ -371,6 +378,9 @@ public class GATKArgumentCollection { (other.performanceLog != null && !other.performanceLog.equals(this.performanceLog))) return false; + if(rodInputType != other.rodInputType) + return false; + return true; } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java index 88f6ae024..36842124c 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java @@ -202,10 +202,8 @@ class ReferenceOrderedDataPool extends ResourcePool 0) throw new ReviewedStingException("BUG: Tried to create multiple iterators over streaming ROD interface"); - LocationAwareSeekableRODIterator iter = new SeekableRODIterator(referenceSequenceDictionary,genomeLocParser,builder.createInstanceOfTrack(fileDescriptor).getIterator()); + RMDTrack track = builder.createInstanceOfTrack(fileDescriptor); + LocationAwareSeekableRODIterator iter = new SeekableRODIterator(track.getHeader(),track.getSequenceDictionary(),referenceSequenceDictionary,genomeLocParser,track.getIterator()); return (flashbackData) ? new FlashBackIterator(iter) : iter; } @@ -344,9 +343,9 @@ class ReferenceOrderedQueryDataPool extends ResourcePool it; List records = null; // here we will keep a pile of records overlaping with current position; when we iterate // and step out of record's scope, we purge it from the list @@ -86,8 +93,10 @@ public class SeekableRODIterator implements LocationAwareSeekableRODIterator { // This implementation tracks the query history and makes next() illegal after a seekforward query of length > 1, // but re-enables next() again after a length-1 query. - public SeekableRODIterator(SAMSequenceDictionary dictionary,GenomeLocParser parser,CloseableIterator it) { + public SeekableRODIterator(Object header,SAMSequenceDictionary rodDictionary,SAMSequenceDictionary referenceDictionary,GenomeLocParser parser,CloseableIterator it) { + this.header = header; this.parser = parser; + this.sequenceDictionary = rodDictionary; this.it = new PushbackIterator(it); records = new LinkedList(); // the following is a trick: we would like the iterator to know the actual name assigned to @@ -99,9 +108,28 @@ public class SeekableRODIterator implements LocationAwareSeekableRODIterator { if (this.it.hasNext()) r = this.it.element(); name = (r==null?null:r.getName()); - curr_contig = dictionary.getSequence(0).getSequenceName(); + curr_contig = referenceDictionary.getSequence(0).getSequenceName(); } + /** + * Gets the header associated with the backing input stream. + * @return the ROD header. + */ + @Override + public Object getHeader() { + return header; + } + + /** + * Gets the sequence dictionary associated with the backing input stream. + * @return sequence dictionary from the ROD header. + */ + @Override + public SAMSequenceDictionary getSequenceDictionary() { + return sequenceDictionary; + } + + /** * Returns true if the data we iterate over has records associated with (any, not necessarily adjacent) * genomic position farther along the reference. diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java b/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java index 85a03a73d..dc6da7132 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java @@ -23,6 +23,7 @@ package org.broadinstitute.sting.gatk.refdata.utils; +import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.Comparator; @@ -56,6 +57,25 @@ public class FlashBackIterator implements LocationAwareSeekableRODIterator { this.iterator = iterator; } + /** + * Gets the header associated with the backing input stream. + * @return the ROD header. + */ + @Override + public Object getHeader() { + return iterator.getHeader(); + } + + /** + * Gets the sequence dictionary associated with the backing input stream. + * @return sequence dictionary from the ROD header. + */ + @Override + public SAMSequenceDictionary getSequenceDictionary() { + return iterator.getSequenceDictionary(); + } + + /** * peek at the next location * @return diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java b/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java index 29e42eab6..83aa5f056 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.refdata.utils; +import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.GenomeLoc; @@ -15,6 +16,10 @@ import java.util.List; * combine iteration with a position aware interface */ public interface LocationAwareSeekableRODIterator extends CloseableIterator { + public Object getHeader(); + + public SAMSequenceDictionary getSequenceDictionary(); + public GenomeLoc peekNextLocation(); public GenomeLoc position(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 972dbe511..d35eaea2c 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -406,7 +406,7 @@ public class DepthOfCoverageWalker extends LocusWalker { getToolkit().getArguments().unsafe); RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName)); - refseqIterator = new SeekableRODIterator(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + refseqIterator = new SeekableRODIterator(refseq.getHeader(), + refseq.getSequenceDictionary(), + getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), getToolkit().getGenomeLocParser(), refseq.getIterator()); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java index af6c56aa3..81c45206b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java @@ -32,6 +32,7 @@ import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; @@ -87,9 +88,13 @@ public class PickSequenomProbes extends RodWalker { ReferenceOrderedData snp_mask; if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) { RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe); - CloseableIterator iter = builder.createInstanceOfTrack(DbSNPCodec.class,new java.io.File(SNP_MASK)).getIterator(); - snpMaskIterator = new SeekableRODIterator(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),iter); - + RMDTrack track = builder.createInstanceOfTrack(DbSNPCodec.class,new java.io.File(SNP_MASK)); + snpMaskIterator = new SeekableRODIterator(track.getHeader(), + track.getSequenceDictionary(), + getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(), + track.getIterator()); + } else { // TODO: fix me when Plink is back throw new IllegalArgumentException("We currently do not support other snp_mask tracks (like Plink)"); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java index 6dc1a4443..13499214e 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java @@ -38,8 +38,11 @@ public class IndelAnnotator extends RodWalker { getToolkit().getArguments().unsafe); RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName)); - refseqIterator = new SeekableRODIterator(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(),refseq.getIterator()); + refseqIterator = new SeekableRODIterator(refseq.getHeader(), + refseq.getSequenceDictionary(), + getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(), + refseq.getIterator()); logger.info("Using RefSeq annotations from " + RefseqFileName); } diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index 9c84c3e18..41bdda0e0 100644 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.datasources.providers; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; @@ -109,6 +110,25 @@ class FakePeekingRODIterator implements LocationAwareSeekableRODIterator { this.location = genomeLocParser.createGenomeLoc(startingLoc.getContig(), startingLoc.getStart() + 1, startingLoc.getStop() + 1); } + /** + * Gets the header associated with the backing input stream. + * @return the ROD header. + */ + @Override + public Object getHeader() { + return null; + } + + /** + * Gets the sequence dictionary associated with the backing input stream. + * @return sequence dictionary from the ROD header. + */ + @Override + public SAMSequenceDictionary getSequenceDictionary() { + return null; + } + + @Override public GenomeLoc peekNextLocation() { System.err.println("Peek Next -> " + location); diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java index d9b30cdf3..6efd460cf 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.refdata.utils; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceDictionary; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; @@ -157,6 +158,25 @@ class FakeSeekableRODIterator implements LocationAwareSeekableRODIterator { this.location = genomeLocParser.createGenomeLoc(startingLoc.getContig(), startingLoc.getStart() + 1, startingLoc.getStop() + 1); } + /** + * Gets the header associated with the backing input stream. + * @return the ROD header. + */ + @Override + public Object getHeader() { + return null; + } + + /** + * Gets the sequence dictionary associated with the backing input stream. + * @return sequence dictionary from the ROD header. + */ + @Override + public SAMSequenceDictionary getSequenceDictionary() { + return null; + } + + @Override public GenomeLoc peekNextLocation() { System.err.println("Peek Next -> " + location);