From 04befb942ec245eaa167a98ed86631e4cb50bca1 Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 12 Mar 2009 23:30:19 +0000 Subject: [PATCH] Added support for HangingLocusIterator git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@42 348d0f76-0448-11de-a6fe-93d51630548a --- .../broadinstitute/sting/atk/AnalysisTK.java | 1 + .../sting/atk/LocusContext.java | 23 ++-- .../sting/atk/LocusIterator.java | 119 +----------------- .../broadinstitute/sting/atk/LocusWalker.java | 4 +- .../sting/atk/TraversalEngine.java | 11 +- .../sting/atk/modules/BasicLociWalker.java | 5 +- .../sting/atk/modules/CountLociWalker.java | 4 +- .../sting/atk/modules/GenotypeWalker.java | 3 +- .../sting/atk/modules/NullWalker.java | 5 +- .../sting/atk/modules/PileupWalker.java | 5 +- .../atk/modules/SingleSampleGenotyper.java | 5 +- .../broadinstitute/sting/utils/GenomeLoc.java | 52 +++++++- .../org/broadinstitute/sting/utils/Utils.java | 27 ++-- 13 files changed, 103 insertions(+), 161 deletions(-) diff --git a/playground/java/src/org/broadinstitute/sting/atk/AnalysisTK.java b/playground/java/src/org/broadinstitute/sting/atk/AnalysisTK.java index c2c8f2dfb..6b58ed926 100644 --- a/playground/java/src/org/broadinstitute/sting/atk/AnalysisTK.java +++ b/playground/java/src/org/broadinstitute/sting/atk/AnalysisTK.java @@ -40,6 +40,7 @@ public class AnalysisTK extends CommandLineProgram { addModule("Genotype", new GenotypeWalker()); addModule("SingleSampleGenotyper", new SingleSampleGenotyper()); addModule("Null", new NullWalker()); + addModule("DepthOfCoverage", new DepthOfCoverageWalker()); } private TraversalEngine engine = null; diff --git a/playground/java/src/org/broadinstitute/sting/atk/LocusContext.java b/playground/java/src/org/broadinstitute/sting/atk/LocusContext.java index 2a5552e04..d08339cad 100755 --- a/playground/java/src/org/broadinstitute/sting/atk/LocusContext.java +++ b/playground/java/src/org/broadinstitute/sting/atk/LocusContext.java @@ -4,6 +4,8 @@ import net.sf.samtools.SAMRecord; import java.util.List; +import org.broadinstitute.sting.utils.GenomeLoc; + /** * Created by IntelliJ IDEA. * User: mdepristo @@ -11,18 +13,17 @@ import java.util.List; * Time: 3:01:34 PM * To change this template use File | Settings | File Templates. */ -public class LocusContext { - public LocusContext() { }; - - // How big is the current context? - public int getLength() { return 1; } - - // get the reference base at the current (relative) position - public byte getReferenceBase() { return 0; } - +public interface LocusContext { // get all of the reads within this context - public List getReads() { return null; } + public List getReads(); // get a list of the equivalent positions within in the reads at Pos - public List getOffsets() { return null; } + public List getOffsets(); + + + public String getContig(); + public long getPosition(); + public GenomeLoc getLocation(); + + public int numReads(); } diff --git a/playground/java/src/org/broadinstitute/sting/atk/LocusIterator.java b/playground/java/src/org/broadinstitute/sting/atk/LocusIterator.java index 433b7c1c4..7e4a3b29f 100755 --- a/playground/java/src/org/broadinstitute/sting/atk/LocusIterator.java +++ b/playground/java/src/org/broadinstitute/sting/atk/LocusIterator.java @@ -14,37 +14,13 @@ import java.util.Iterator; /** * Iterator that traverses a SAM File, accumulating information on a per-locus basis */ -public class LocusIterator implements Iterable, CloseableIterator { - - // ----------------------------------------------------------------------------------------------------------------- - // - // member fields - // - // ----------------------------------------------------------------------------------------------------------------- - private final PushbackIterator it; - private String contig = null; - private int position = -1; - private List reads = new ArrayList(100); - private List offsets = new ArrayList(100); - - protected String getContig() { return contig; } - protected long getPosition() { return position; } - public GenomeLoc getLocation() { return new GenomeLoc(contig, position); } - - public List getReads() { return reads; } - public List getOffsets() { return offsets; } - public int numReads() { return reads.size(); } - +public abstract class LocusIterator implements Iterable, CloseableIterator { // ----------------------------------------------------------------------------------------------------------------- // // constructors and other basic operations // // ----------------------------------------------------------------------------------------------------------------- - public LocusIterator(final CloseableIterator samIterator) { - this.it = new PushbackIterator(samIterator); - } - - public Iterator iterator() { + public Iterator iterator() { return this; } @@ -52,95 +28,8 @@ public class LocusIterator implements Iterable, CloseableIterator //this.it.close(); } - public boolean hasNext() { - return it.hasNext(); - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // next() routine and associated collection operations - // - // ----------------------------------------------------------------------------------------------------------------- - public LocusIterator next() { - position += 1; - - if ( position != -1 ) { - cleanReads(); - expandReads(); - } - - if ( reads.isEmpty() ) { - // the window is empty, we need to jump to the first pos of the first read in the stream - SAMRecord read = it.next(); - pushRead(read); - contig = read.getReferenceName(); - position = read.getAlignmentStart() - 1; - return next(); - } - else { - // at this point, window contains all reads covering the pos, we need to return them - // and the offsets into each read for this loci - calcOffsetsOfWindow(position); - return this; - } - } - - private void pushRead(SAMRecord read) { - //System.out.printf(" -> Adding read %s %d-%d flags %s%n", read.getReadName(), read.getAlignmentStart(), read.getAlignmentEnd(), Utils.readFlagsAsString(read)); - reads.add(read); - } - - class KeepReadPFunc implements Predicate { - public boolean apply(SAMRecord read) { - return position >= read.getAlignmentStart() && - position < read.getAlignmentEnd() && - read.getReferenceName().equals(contig); // should be index for efficiency - } - } - Predicate KeepReadP = new LocusIterator.KeepReadPFunc(); - - private void calcOffsetsOfWindow(final int position) { - offsets.clear(); - for ( SAMRecord read : reads ) { -// def calcOffset( read ): -// offset = self.pos - read.start -// return offset -// -// offsets = map(calcOffset, self.window) - final int offset = position - read.getAlignmentStart(); - assert(offset < read.getReadLength() ); - offsets.add(offset); - //System.out.printf("offsets [%d] %s%n", read.getAlignmentStart(), offsets); - } - } - - private void cleanReads() { - // def keepReadP( read ): - // return read.chr == chr and pos >= read.start and pos <= read.end - // self.window = filter( keepReadP, self.window ) - reads = Utils.filter(KeepReadP, reads); - } - - private void expandReads() { -// for read in self.rs: -// #print 'read', read, pos -// if read.chr == chr and read.start <= pos and read.end >= pos: -// self.pushRead(read) -// else: -// self.rs.unget( read ) -// #self.rs = chain( [read], self.rs ) -// break - while ( it.hasNext() ) { - SAMRecord read = it.next(); - if ( KeepReadP.apply( read ) ) { - pushRead(read); - } - else { - it.pushback(read); - break; - } - } - } + public abstract boolean hasNext(); + public abstract LocusContext next(); public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); diff --git a/playground/java/src/org/broadinstitute/sting/atk/LocusWalker.java b/playground/java/src/org/broadinstitute/sting/atk/LocusWalker.java index 275e93da0..a360ff8b5 100755 --- a/playground/java/src/org/broadinstitute/sting/atk/LocusWalker.java +++ b/playground/java/src/org/broadinstitute/sting/atk/LocusWalker.java @@ -17,10 +17,10 @@ public interface LocusWalker { public String walkerType(); // Do we actually want to operate on the context? - boolean filter(List rodData, char ref, LocusIterator context); + boolean filter(List rodData, char ref, LocusContext context); // Map over the org.broadinstitute.sting.atk.LocusContext - MapType map(List rodData, char ref, LocusIterator context); + MapType map(List rodData, char ref, LocusContext context); // Given result of map function ReduceType reduceInit(); diff --git a/playground/java/src/org/broadinstitute/sting/atk/TraversalEngine.java b/playground/java/src/org/broadinstitute/sting/atk/TraversalEngine.java index b2d10ddf4..7ca448fe1 100755 --- a/playground/java/src/org/broadinstitute/sting/atk/TraversalEngine.java +++ b/playground/java/src/org/broadinstitute/sting/atk/TraversalEngine.java @@ -383,12 +383,6 @@ public class TraversalEngine { result = true; why = "No alignment start"; } - else if ( rec.getCigar().numCigarElements() > 1 ) { - // FIXME -- deal with indels correctly! - nSkippedIndels++; - result = true; - why = "Skipping indel: " + rec.getCigarString(); - } else { result = false; } @@ -417,7 +411,8 @@ public class TraversalEngine { protected int traverseByLoci(LocusWalker walker) { // prepare the read filtering read iterator and provide it to a new locus iterator FilteringIterator filterIter = new FilteringIterator(samReadIter, new locusStreamFilterFunc()); - CloseableIterator iter = new LocusIterator(filterIter); + //LocusIterator iter = new SingleLocusIterator(filterIter); + LocusIterator iter = new LocusIteratorByHanger(filterIter); // Initial the reference ordered data iterators List rodIters = initializeRODs(); @@ -432,7 +427,7 @@ public class TraversalEngine { this.nRecords++; // actually get the read and hand it to the walker - final LocusIterator locus = iter.next(); + final LocusContext locus = iter.next(); // Poor man's version of index LOL if ( inLocations(locus.getLocation()) ) { diff --git a/playground/java/src/org/broadinstitute/sting/atk/modules/BasicLociWalker.java b/playground/java/src/org/broadinstitute/sting/atk/modules/BasicLociWalker.java index 72910f2be..aff1efdae 100755 --- a/playground/java/src/org/broadinstitute/sting/atk/modules/BasicLociWalker.java +++ b/playground/java/src/org/broadinstitute/sting/atk/modules/BasicLociWalker.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules; import org.broadinstitute.sting.atk.LocusWalker; import org.broadinstitute.sting.atk.LocusIterator; +import org.broadinstitute.sting.atk.LocusContext; import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import net.sf.samtools.SAMRecord; @@ -22,7 +23,7 @@ public abstract class BasicLociWalker implements LocusWalke public String walkerType() { return "ByLocus"; } // Do we actually want to operate on the context? - public boolean filter(List rodData, char ref, LocusIterator context) { + public boolean filter(List rodData, char ref, LocusContext context) { return true; // We are keeping all the reads } @@ -30,7 +31,7 @@ public abstract class BasicLociWalker implements LocusWalke } // These three capabilities must be overidden - public abstract MapType map(List rodData, char ref, LocusIterator context); + public abstract MapType map(List rodData, char ref, LocusContext context); public abstract ReduceType reduceInit(); public abstract ReduceType reduce(MapType value, ReduceType sum); diff --git a/playground/java/src/org/broadinstitute/sting/atk/modules/CountLociWalker.java b/playground/java/src/org/broadinstitute/sting/atk/modules/CountLociWalker.java index 372f453d1..af3a9d733 100755 --- a/playground/java/src/org/broadinstitute/sting/atk/modules/CountLociWalker.java +++ b/playground/java/src/org/broadinstitute/sting/atk/modules/CountLociWalker.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.atk.modules; -import org.broadinstitute.sting.atk.LocusIterator; +import org.broadinstitute.sting.atk.LocusContext; import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import java.util.List; @@ -13,7 +13,7 @@ import java.util.List; * To change this template use File | Settings | File Templates. */ public class CountLociWalker extends BasicLociWalker { - public Integer map(List rodData, char ref, LocusIterator context) { + public Integer map(List rodData, char ref, LocusContext context) { return 1; } diff --git a/playground/java/src/org/broadinstitute/sting/atk/modules/GenotypeWalker.java b/playground/java/src/org/broadinstitute/sting/atk/modules/GenotypeWalker.java index 5bff04810..69ca12c65 100755 --- a/playground/java/src/org/broadinstitute/sting/atk/modules/GenotypeWalker.java +++ b/playground/java/src/org/broadinstitute/sting/atk/modules/GenotypeWalker.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules; import org.broadinstitute.sting.atk.LocusIterator; import org.broadinstitute.sting.atk.GenotypeEvidence; +import org.broadinstitute.sting.atk.LocusContext; import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import net.sf.samtools.SAMRecord; @@ -10,7 +11,7 @@ import java.util.List; import static java.lang.System.currentTimeMillis; public class GenotypeWalker extends BasicLociWalker { - public Integer map(List rodData, char ref, LocusIterator context) { + public Integer map(List rodData, char ref, LocusContext context) { //char[] = new char(26); long start_tm = currentTimeMillis(); List reads = context.getReads(); diff --git a/playground/java/src/org/broadinstitute/sting/atk/modules/NullWalker.java b/playground/java/src/org/broadinstitute/sting/atk/modules/NullWalker.java index d6d3f91c8..16a810a95 100644 --- a/playground/java/src/org/broadinstitute/sting/atk/modules/NullWalker.java +++ b/playground/java/src/org/broadinstitute/sting/atk/modules/NullWalker.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules; import org.broadinstitute.sting.atk.LocusWalker; import org.broadinstitute.sting.atk.LocusIterator; +import org.broadinstitute.sting.atk.LocusContext; import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.rodDbSNP; import org.broadinstitute.sting.utils.Utils; @@ -19,12 +20,12 @@ public class NullWalker implements LocusWalker { public String walkerType() { return "ByLocus"; } // Do we actually want to operate on the context? - public boolean filter(List rodData, char ref, LocusIterator context) { + public boolean filter(List rodData, char ref, LocusContext context) { return true; // We are keeping all the reads } // Map over the org.broadinstitute.sting.atk.LocusContext - public Integer map(List rodData, char ref, LocusIterator context) + public Integer map(List rodData, char ref, LocusContext context) { return 1; } diff --git a/playground/java/src/org/broadinstitute/sting/atk/modules/PileupWalker.java b/playground/java/src/org/broadinstitute/sting/atk/modules/PileupWalker.java index ca01fc5ad..d44b08c79 100644 --- a/playground/java/src/org/broadinstitute/sting/atk/modules/PileupWalker.java +++ b/playground/java/src/org/broadinstitute/sting/atk/modules/PileupWalker.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules; import org.broadinstitute.sting.atk.LocusWalker; import org.broadinstitute.sting.atk.LocusIterator; +import org.broadinstitute.sting.atk.LocusContext; import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.rodDbSNP; import org.broadinstitute.sting.utils.Utils; @@ -23,12 +24,12 @@ public class PileupWalker implements LocusWalker { public String walkerType() { return "ByLocus"; } // Do we actually want to operate on the context? - public boolean filter(List rodData, char ref, LocusIterator context) { + public boolean filter(List rodData, char ref, LocusContext context) { return true; // We are keeping all the reads } // Map over the org.broadinstitute.sting.atk.LocusContext - public Integer map(List rodData, char ref, LocusIterator context) { + public Integer map(List rodData, char ref, LocusContext context) { //System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size()); //for ( SAMRecord read : context.getReads() ) { // System.out.println(" -> " + read.getReadName()); diff --git a/playground/java/src/org/broadinstitute/sting/atk/modules/SingleSampleGenotyper.java b/playground/java/src/org/broadinstitute/sting/atk/modules/SingleSampleGenotyper.java index 5363f417c..f9c9f7a4b 100644 --- a/playground/java/src/org/broadinstitute/sting/atk/modules/SingleSampleGenotyper.java +++ b/playground/java/src/org/broadinstitute/sting/atk/modules/SingleSampleGenotyper.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules; import org.broadinstitute.sting.atk.LocusWalker; import org.broadinstitute.sting.atk.LocusIterator; +import org.broadinstitute.sting.atk.LocusContext; import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.rodDbSNP; import org.broadinstitute.sting.utils.Utils; @@ -19,7 +20,7 @@ public class SingleSampleGenotyper implements LocusWalker { public String walkerType() { return "ByLocus"; } // Do we actually want to operate on the context? - public boolean filter(List rodData, char ref, LocusIterator context) { + public boolean filter(List rodData, char ref, LocusContext context) { return true; // We are keeping all the reads } @@ -86,7 +87,7 @@ public class SingleSampleGenotyper implements LocusWalker { } // Map over the org.broadinstitute.sting.atk.LocusContext - public Integer map(List rodData, char ref, LocusIterator context) { + public Integer map(List rodData, char ref, LocusContext context) { //System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size()); //for ( SAMRecord read : context.getReads() ) { // System.out.println(" -> " + read.getReadName()); diff --git a/playground/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/playground/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index acedd4209..d263d33a9 100644 --- a/playground/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/playground/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -23,11 +23,19 @@ public class GenomeLoc implements Comparable { // Ugly global variable defining the optional ordering of contig elements // public static HashMap refContigOrdering = null; + public static HashMap interns = null; + public static void setContigOrdering(HashMap rco) { refContigOrdering = rco; + interns = new HashMap(); + for ( String contig : rco.keySet() ) + interns.put( contig, contig ); } - public GenomeLoc( final String contig, final long start, final long stop ) { + public GenomeLoc( String contig, final long start, final long stop ) { + if ( interns != null ) + contig = interns.get(contig); + this.contig = contig; this.start = start; this.stop = stop; @@ -37,12 +45,16 @@ public class GenomeLoc implements Comparable { this( contig, pos, pos ); } + public GenomeLoc( final GenomeLoc toCopy ) { + this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() ); + } + // // Parsing string representations // private static long parsePosition( final String pos ) { String x = pos.replaceAll(",", ""); - return Long.parseLong(x); + return Long.parseLong(x); } public static GenomeLoc parseGenomeLoc( final String str ) { @@ -57,7 +69,7 @@ public class GenomeLoc implements Comparable { long start = 1; long stop = Integer.MAX_VALUE; boolean bad = false; - + Matcher match1 = regex1.matcher(str); Matcher match2 = regex2.matcher(str); Matcher match3 = regex3.matcher(str); @@ -133,7 +145,7 @@ public class GenomeLoc implements Comparable { if ( that.start > this.stop ) return true; // that guy is past our start return false; } - + public final boolean overlapsP(GenomeLoc that) { return ! disjointP( that ); } @@ -142,11 +154,41 @@ public class GenomeLoc implements Comparable { return this.contig.equals(that.contig); } + public final int minus( final GenomeLoc that ) { + if ( this.getContig().equals(that.getContig()) ) + return (int) (this.getStart() - that.getStart()); + else + return Integer.MAX_VALUE; + } + public final int distance( final GenomeLoc that ) { + return Math.abs(minus(that)); + } + + public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) { + return this.compareTo(left) > -1 && this.compareTo(right) < 1; + } + + public final void incPos() { + incPos(1); + } + public final void incPos(long by) { + this.start += by; + this.stop += by; + } + + public final GenomeLoc nextLoc() { + GenomeLoc n = new GenomeLoc(this); + n.incPos(); + return n; + } // // Comparison operations // public static int compareContigs( final String thisContig, final String thatContig ) { + if ( thisContig == thatContig ) + return 0; + if ( refContigOrdering != null ) { if ( ! refContigOrdering.containsKey(thisContig) ) { if ( ! refContigOrdering.containsKey(thatContig) ) { @@ -192,4 +234,4 @@ public class GenomeLoc implements Comparable { if ( this.getStop() > that.getStop() ) return 1; return 0; } -} +} \ No newline at end of file diff --git a/playground/java/src/org/broadinstitute/sting/utils/Utils.java b/playground/java/src/org/broadinstitute/sting/utils/Utils.java index 52ba6d81b..6458459bc 100755 --- a/playground/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/playground/java/src/org/broadinstitute/sting/utils/Utils.java @@ -68,8 +68,15 @@ public class Utils { return ret.toString(); } - public static String join(String separator, Collection strings) { - return join( separator, strings.toArray(new String[0]) ); + //public static String join(String separator, Collection strings) { + // return join( separator, strings.toArray(new String[0]) ); + //} + + public static String join(String separator, Collection objects) { + ArrayList strs = new ArrayList(); + for ( Object x : objects ) + strs.add(x.toString()); + return join( separator, strs.toArray(new String[0]) ); } public static double average(List vals, int maxI) { @@ -97,14 +104,16 @@ public class Utils { List refContigs = refFile.getSequenceDictionary(); HashMap refContigOrdering = new HashMap(); - int i = 0; - System.out.printf("Prepared reference sequence contig dictionary%n order ->"); - for ( SAMSequenceRecord contig : refContigs ) { - System.out.printf(" %s", contig.getSequenceName()); - refContigOrdering.put(contig.getSequenceName(), i); - i++; + if ( refContigs != null ) { + int i = 0; + System.out.printf("Prepared reference sequence contig dictionary%n order ->"); + for ( SAMSequenceRecord contig : refContigs ) { + System.out.printf(" %s", contig.getSequenceName()); + refContigOrdering.put(contig.getSequenceName(), i); + i++; + } + System.out.printf("%n Total elements -> %d%n", refContigOrdering.size()); } - System.out.printf("%n Total elements -> %d%n", refContigOrdering.size()); GenomeLoc.setContigOrdering(refContigOrdering); }