diff --git a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java index 31bcf1083..4138e8b57 100755 --- a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java @@ -134,6 +134,11 @@ public class GATKArgumentCollection { @Argument(fullName = "numthreads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) public int numberOfThreads = 1; + @Element(required = false) + @Argument(fullName = "LocusIteratorByState", shortName = "LIBS", doc = "Should we use the new LocusIteratorByState or the old LocusIteratorByHanger?", required = false) + public Boolean useLocusIteratorByState = false; + + /** * marshal the data out to a object * @@ -272,6 +277,10 @@ public class GATKArgumentCollection { if (other.numberOfThreads != this.numberOfThreads) { return false; } + if (other.useLocusIteratorByState != this.useLocusIteratorByState) { + return false; + } + return true; } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index 612d03e76..5ce839a41 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -4,10 +4,12 @@ import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByHanger; +import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; import java.util.Arrays; @@ -59,7 +61,10 @@ public abstract class LocusView extends LocusIterator implements View { Iterator reads = new FilteringIterator(provider.getReadIterator(), new LocusStreamFilterFunc()); this.sourceInfo = provider.getReadIterator().getSourceInfo(); - this.loci = new LocusIteratorByHanger(reads, sourceInfo); + if (GenomeAnalysisEngine.instance.getArguments().useLocusIteratorByState) + this.loci = new LocusIteratorByState(reads, sourceInfo); + else + this.loci = new LocusIteratorByHanger(reads, sourceInfo); seedNextLocus(); provider.register(this); @@ -144,6 +149,7 @@ public abstract class LocusView extends LocusIterator implements View { * Seed the nextLocus variable with the contents of the next locus (if one exists). */ private void seedNextLocus() { + //System.out.printf("loci is %s%n", loci); if( loci.hasNext() ) nextLocus = loci.next(); diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java new file mode 100755 index 000000000..9d6903baa --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.*; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.RefHanger; +import org.broadinstitute.sting.utils.Utils; + +import java.util.*; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public class LocusIteratorByState extends LocusIterator { + + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + + // ----------------------------------------------------------------------------------------------------------------- + // + // member fields + // + // ----------------------------------------------------------------------------------------------------------------- + private final PushbackIterator it; + + private class SAMRecordState { + SAMRecord read; + int readOffset = -1; // how far are we offset from the start of the read bases? + int genomeOffset = -1; // how far are we offset from the alignment start on the genome? + + Cigar cigar = null; + int cigarOffset = -1; + CigarElement curElement = null; + int nCigarElements = 0; + + // how far are we into a single cigarElement + int cigarElementCounter = -1; + + public SAMRecordState(SAMRecord read) { + this.read = read; + cigar = read.getCigar(); + nCigarElements = cigar.numCigarElements(); + + //System.out.printf("Creating a SAMRecordState: %s%n", this); + } + + public SAMRecord getRead() { return read; } + + /** + * What is our current offset in the read's bases that aligns us with the reference genome? + * + * @return + */ + public int getReadOffset() { return readOffset; } + + /** + * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? + * + * @return + */ + public int getGenomeOffset() { return genomeOffset; } + + public int getGenomePosition() { return read.getAlignmentStart() + getGenomeOffset(); } + + public GenomeLoc getLocation() { + return GenomeLocParser.createGenomeLoc(read.getReferenceIndex(), getGenomePosition()); + } + + //private CigarElement getCurElement() { return curElement; } + + public CigarOperator getCurrentCigarOperator() { + //if ( curElement == null ) + // System.out.printf("%s%n", this); + return curElement.getOperator(); + } + + public String toString() { + return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); + } + + public CigarOperator stepForwardOnGenome() { + //if ( cigarOffset == cigar.numCigarElements() ) + // return null; // we are done + + if (DEBUG2) System.out.printf("stepForwardOnGenome2: curElement=%s, counter=%d, len=%d%n", curElement, cigarElementCounter, curElement != null ? curElement.getLength() : -1); + if ( curElement == null || ++cigarElementCounter >= curElement.getLength() ) { + cigarOffset++; + if ( cigarOffset < nCigarElements ) { + curElement = cigar.getCigarElement(cigarOffset); + cigarElementCounter = 0; + } else { + return null; + } + } + + boolean done = false; + switch (curElement.getOperator()) { + case H : // ignore hard clips + case P : // ignore pads + cigarElementCounter = curElement.getLength(); + break; + case S : // soft clip + case I : // insertion w.r.t. the reference +// readOffset++; done = true; break; + cigarElementCounter = curElement.getLength(); + readOffset += curElement.getLength(); + break; + case N : // reference skip + cigarElementCounter = curElement.getLength(); + genomeOffset += curElement.getLength(); + break; + case D : // deletion w.r.t. the reference + genomeOffset++; + done = true; + break; + case M : + readOffset++; + genomeOffset++; + done = true; + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); + } + + if (DEBUG2) System.out.printf("stepForwardOnGenome3: done=%b curElement=%s, counter=%d, len=%d, offset=%d%n", + done, curElement, cigarElementCounter, curElement != null ? curElement.getLength() : -1, getReadOffset()); + return done ? curElement.getOperator() : stepForwardOnGenome(); + } + } + +/* private class SAMRecordState { + SAMRecord read; + int readOffset = -1; // how far are we offset from the start of the read bases? + int genomeOffset = -1; // how far are we offset from the alignment start on the genome? + Iterator cigarIt = null; + CigarElement curElement = null; + int cigarElementCounter = 0; + + public SAMRecordState(SAMRecord read) { + this.read = read; + cigarIt = read.getCigar().getCigarElements().iterator(); + //stepForwardOnGenome(); // todo -- may be a performance issue -- can we avoid this? + } + + public SAMRecord getRead() { return read; } + + *//** + * What is our current offset in the read's bases that aligns us with the reference genome? + * + * @return + *//* + public int getReadOffset() { return readOffset; } + + *//** + * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? + * + * @return + *//* + public int getGenomeOffset() { return genomeOffset; } + + public int getGenomePosition() { return read.getAlignmentStart() + getGenomeOffset(); } + + public GenomeLoc getLocation() { + return GenomeLocParser.createGenomeLoc(read.getReferenceIndex(), getGenomePosition()); + } + + public CigarOperator getCurrentCigarOperator() { return curElement.getOperator(); } + + public CigarOperator stepForwardOnGenome() { + if (DEBUG2) System.out.printf("stepForwardOnGenome1: cigarIt=%s%n", cigarIt); + if (cigarIt == null && ! cigarIt.hasNext()) + return null; // we are done + + if (DEBUG2) System.out.printf("stepForwardOnGenome2: curElement=%s, counter=%d, len=%d%n", curElement, cigarElementCounter, curElement != null ? curElement.getLength() : -1); + if ( curElement == null || ++cigarElementCounter >= curElement.getLength() ) { + if ( cigarIt.hasNext() ) { + curElement = cigarIt.next(); + cigarElementCounter = 0; + } else { + return null; + } + } + + boolean done = false; + switch (curElement.getOperator()) { + case H : // ignore hard clips + case P : // ignore pads + cigarElementCounter = curElement.getLength(); + break; + case S : // soft clip + case I : // insertion w.r.t. the reference +// readOffset++; done = true; break; + cigarElementCounter = curElement.getLength(); + readOffset += curElement.getLength(); + break; + case N : // reference skip + cigarElementCounter = curElement.getLength(); + genomeOffset += curElement.getLength(); + break; + case D : // deletion w.r.t. the reference + genomeOffset++; + done = true; + break; + case M : + readOffset++; + genomeOffset++; + done = true; + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); + } + + if (DEBUG2) System.out.printf("stepForwardOnGenome3: done=%b curElement=%s, counter=%d, len=%d, offset=%d%n", + done, curElement, cigarElementCounter, curElement != null ? curElement.getLength() : -1, getReadOffset()); + return done ? curElement.getOperator() : stepForwardOnGenome(); + } + }*/ + + private LinkedList readStates = new LinkedList(); + final boolean DEBUG = false; + final boolean DEBUG2 = false && DEBUG; + private Reads readInfo; + + // ----------------------------------------------------------------------------------------------------------------- + // + // constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + public LocusIteratorByState(final Iterator samIterator, Reads readInformation) { + this.it = new PushbackIterator(samIterator); + this.readInfo = readInformation; + } + + public Iterator iterator() { + return this; + } + + public void close() { + //this.it.close(); + } + + public boolean hasNext() { + boolean r = ! readStates.isEmpty() || it.hasNext(); + if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); + return r; + } + + public void printState() { + for ( SAMRecordState state : readStates ) { + logger.debug(String.format("printState():")); + SAMRecord read = state.getRead(); + int offset = state.getReadOffset(); + logger.debug(String.format(" read: %s(%d)=%s, cigar=%s", read.getReadName(), offset, read.getReadString().charAt(offset), read.getCigarString())); + } + } + + public void clear() { + logger.debug(String.format(("clear() called"))); + readStates.clear(); + } + + private GenomeLoc getLocation() { + return readStates.isEmpty() ? null : readStates.getFirst().getLocation(); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // next() routine and associated collection operations + // + // ----------------------------------------------------------------------------------------------------------------- + public AlignmentContext next() { + if (DEBUG) { + logger.debug("in Next:"); + printState(); + } + + collectPendingReads(readInfo.getMaxReadsAtLocus()); + + // todo -- performance problem -- should be lazy, really + LinkedList reads = new LinkedList(); + LinkedList offsets = new LinkedList(); + for ( SAMRecordState state : readStates ) { + // todo -- enable D operation in alignment contexts + if ( state.getCurrentCigarOperator() != CigarOperator.D ) { + reads.add(state.getRead()); + offsets.add(state.getReadOffset()); + } + } + GenomeLoc loc = getLocation(); + + updateReadStates(); // critical - must be called after we get the current state offsets and location + + if (DEBUG) { + logger.debug("DONE WITH NEXT, updating read states, current state is:"); + printState(); + } + + return new AlignmentContext(loc, reads, offsets); + } + + private void collectPendingReads(final int maximumPileupSize) { + if (DEBUG) { + logger.debug(String.format("entering collectPendingReads..., hasNext=%b", it.hasNext())); + printState(); + } + + Boolean warned = false; // warn them once per locus + int curSize = readStates.size(); // simple performance improvement -- avoids unnecessary size() operation + while (it.hasNext()) { + SAMRecord read = it.next(); + + if (DEBUG) logger.debug(String.format("Considering read %s: %s vs. %s", + read.getReadName(), getLocation(), GenomeLocParser.createGenomeLoc(read))); + + //GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read); + //if (getLocation() != null && readLoc.distance(getLocation()) >= 1) { + if ( readIsPastCurrentPosition(read) ) { + // We've collected up enough reads + it.pushback(read); + break; + } else { + if ( curSize >= maximumPileupSize ) { + if (!warned) { + warned = true; + Utils.warnUser("Unable to add a read, we're over the hanger limit of " + maximumPileupSize + " at location " + getLocation()); + } + } else { + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + readStates.add(state); + curSize++; + if (DEBUG) logger.debug(String.format(" ... added read %s", read.getReadName())); + } + } + } + } + + // fast testing of position + private boolean readIsPastCurrentPosition(SAMRecord read) { + if ( readStates.isEmpty() ) + return false; + else { + SAMRecordState state = readStates.getFirst(); + SAMRecord ourRead = state.getRead(); + //System.out.printf("read=%d vs. us=%d%n", read.getAlignmentStart(), ourRead.getAlignmentStart()); + return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); + } + } + + + private void updateReadStates() { + Iterator it = readStates.iterator(); + while ( it.hasNext() ) { + SAMRecordState state = it.next(); + CigarOperator op = state.stepForwardOnGenome(); + if ( op == null ) { // we've stepped off the end of the object + if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart())); + it.remove(); + } + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java index f038ee4fd..8af785011 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java @@ -49,11 +49,18 @@ public class EmpiricalSubstitutionGenotypeLikelihoods extends GenotypeLikelihood } } + private static SAMRecord lastReadForPL = null; + private static SequencerPlatform plOfLastRead = null; public static SequencerPlatform getReadSequencerPlatform( SAMRecord read ) { - final String readGroupString = ((String)read.getAttribute("RG")); - SAMReadGroupRecord readGroup = readGroupString == null ? null : read.getHeader().getReadGroup(readGroupString); - final String platformName = readGroup == null ? null : (String)readGroup.getAttribute(SAM_PLATFORM_TAG); - return standardizeSequencerPlatform(platformName); + if ( lastReadForPL != read ) { + lastReadForPL = read; + final String readGroupString = ((String)read.getAttribute("RG")); + SAMReadGroupRecord readGroup = readGroupString == null ? null : read.getHeader().getReadGroup(readGroupString); + final String platformName = readGroup == null ? null : (String)readGroup.getAttribute(SAM_PLATFORM_TAG); + plOfLastRead = standardizeSequencerPlatform(platformName); + } + + return plOfLastRead; } // -------------------------------------------------------------------------------------------------------------- @@ -208,6 +215,10 @@ public class EmpiricalSubstitutionGenotypeLikelihoods extends GenotypeLikelihood return super.clone(); } + protected boolean cacheByTech() { + return true; + } + protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) { boolean fwdStrand = ! read.getReadNegativeStrandFlag(); SequencerPlatform pl = getReadSequencerPlatform(read); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index 3e3baf55b..3155412eb 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -167,6 +167,28 @@ public abstract class GenotypeLikelihoods implements Cloneable { return getPriors()[g.ordinal()]; } + /** + * Simple function to overload to control the caching of genotype likelihood outputs. + * By default the function trues true -- do enable caching. If you are experimenting with an + * complex calcluation of P(B | G) and caching might not work correctly for you, overload this + * function and return false, so the super() object won't try to cache your GL calculations. + * + * @return true if caching should be enabled, false otherwise + */ + protected boolean enableCache() { + return true; + } + + /** + * Does the caller need to know the tech of the read? If true, we will use the declared tech of the + * read for caching, which can be painfully slow. By default we aren't tech aware. + * + * @return true + */ + protected boolean cacheByTech() { + return false; + } + // ----------------------------------------------------------------------------------------------------------------- // // @@ -216,7 +238,7 @@ public abstract class GenotypeLikelihoods implements Cloneable { } - private int reallyAdd(char observedBase, byte qualityScore, SAMRecord read, int offset, boolean enableCache) { + private int reallyAdd(char observedBase, byte qualityScore, SAMRecord read, int offset, boolean enableCacheArg ) { if ( badBase(observedBase) ) { throw new RuntimeException(String.format("BUG: unexpected base %c with Q%d passed to GenotypeLikelihoods", observedBase, qualityScore)); } @@ -225,6 +247,7 @@ public abstract class GenotypeLikelihoods implements Cloneable { if ( qualityScore > getMinQScoreToInclude() ) { // Handle caching if requested. Just look up the cached result if its available, or compute and store it + boolean enableCache = enableCacheArg && enableCache(); GenotypeLikelihoods cached = null; if ( enableCache ) { if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read, offset ) ) { @@ -281,7 +304,8 @@ public abstract class GenotypeLikelihoods implements Cloneable { private GenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset, GenotypeLikelihoods val ) { - int a = EmpiricalSubstitutionGenotypeLikelihoods.getReadSequencerPlatform(read).ordinal(); + EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform pl = cacheByTech() ? EmpiricalSubstitutionGenotypeLikelihoods.getReadSequencerPlatform(read) : EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.UNKNOWN; + int a = pl.ordinal(); int i = BaseUtils.simpleBaseToBaseIndex(observedBase); int j = qualityScore; int k = ploidy; diff --git a/python/gatkConfigParser.py b/python/gatkConfigParser.py index 5cd3796e1..ee3514635 100755 --- a/python/gatkConfigParser.py +++ b/python/gatkConfigParser.py @@ -27,6 +27,7 @@ class gatkConfigParser(ConfigParser.SafeConfigParser): print 'Reading configuration file(s):', files self.read(files) self.validateRequiredOptions() + self.moreArgs = "" def validateRequiredOptions(self): for key, value in defaultRequiredOptions.iteritems(): @@ -71,9 +72,12 @@ class gatkConfigParser(ConfigParser.SafeConfigParser): def gatk_args(self): return self.getOption(self.GATK, 'args') def reference(self): return self.getOption(self.GATK, 'reference') + def setMoreArgs(self, s): + self.moreArgs = s + def gatkCmd(self, mode, log = None, stdLogName=False): cmd = ' '.join([self.java(), self.jvm_args(), '-jar', self.jar(), self.gatk_args(), '-R', self.reference()]) - cmd += ' ' + ' '.join(['-T', mode, self.getGATKModeOption('args', mode)]) + cmd += ' ' + ' '.join(['-T', mode, self.getGATKModeOption('args', mode)]) + ' ' + self.moreArgs if log <> None: if stdLogName: #head, ext = os.path.splitext(log)