Added support for HangingLocusIterator

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@42 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-03-12 23:30:19 +00:00
parent 8a63606e11
commit 04befb942e
13 changed files with 103 additions and 161 deletions

View File

@ -40,6 +40,7 @@ public class AnalysisTK extends CommandLineProgram {
addModule("Genotype", new GenotypeWalker()); addModule("Genotype", new GenotypeWalker());
addModule("SingleSampleGenotyper", new SingleSampleGenotyper()); addModule("SingleSampleGenotyper", new SingleSampleGenotyper());
addModule("Null", new NullWalker()); addModule("Null", new NullWalker());
addModule("DepthOfCoverage", new DepthOfCoverageWalker());
} }
private TraversalEngine engine = null; private TraversalEngine engine = null;

View File

@ -4,6 +4,8 @@ import net.sf.samtools.SAMRecord;
import java.util.List; import java.util.List;
import org.broadinstitute.sting.utils.GenomeLoc;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
* User: mdepristo * User: mdepristo
@ -11,18 +13,17 @@ import java.util.List;
* Time: 3:01:34 PM * Time: 3:01:34 PM
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
public class LocusContext { public interface LocusContext {
public LocusContext() { };
// How big is the current context?
public int getLength() { return 1; }
// get the reference base at the current (relative) position
public byte getReferenceBase() { return 0; }
// get all of the reads within this context // get all of the reads within this context
public List<SAMRecord> getReads() { return null; } public List<SAMRecord> getReads();
// get a list of the equivalent positions within the reads at Pos // get a list of the equivalent positions within the reads at Pos
public List<Integer> getOffsets() { return null; } public List<Integer> getOffsets();
public String getContig();
public long getPosition();
public GenomeLoc getLocation();
public int numReads();
} }

View File

@ -14,37 +14,13 @@ import java.util.Iterator;
/** /**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis * Iterator that traverses a SAM File, accumulating information on a per-locus basis
*/ */
public class LocusIterator implements Iterable<LocusIterator>, CloseableIterator<LocusIterator> { public abstract class LocusIterator implements Iterable<LocusContext>, CloseableIterator<LocusContext> {
// -----------------------------------------------------------------------------------------------------------------
//
// member fields
//
// -----------------------------------------------------------------------------------------------------------------
private final PushbackIterator<SAMRecord> it;
private String contig = null;
private int position = -1;
private List<SAMRecord> reads = new ArrayList<SAMRecord>(100);
private List<Integer> offsets = new ArrayList<Integer>(100);
protected String getContig() { return contig; }
protected long getPosition() { return position; }
public GenomeLoc getLocation() { return new GenomeLoc(contig, position); }
public List<SAMRecord> getReads() { return reads; }
public List<Integer> getOffsets() { return offsets; }
public int numReads() { return reads.size(); }
// ----------------------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------------------
// //
// constructors and other basic operations // constructors and other basic operations
// //
// ----------------------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------------------
public LocusIterator(final CloseableIterator<SAMRecord> samIterator) { public Iterator<LocusContext> iterator() {
this.it = new PushbackIterator<SAMRecord>(samIterator);
}
public Iterator<LocusIterator> iterator() {
return this; return this;
} }
@ -52,95 +28,8 @@ public class LocusIterator implements Iterable<LocusIterator>, CloseableIterator
//this.it.close(); //this.it.close();
} }
public boolean hasNext() { public abstract boolean hasNext();
return it.hasNext(); public abstract LocusContext next();
}
// -----------------------------------------------------------------------------------------------------------------
//
// next() routine and associated collection operations
//
// -----------------------------------------------------------------------------------------------------------------
public LocusIterator next() {
position += 1;
if ( position != -1 ) {
cleanReads();
expandReads();
}
if ( reads.isEmpty() ) {
// the window is empty, we need to jump to the first pos of the first read in the stream
SAMRecord read = it.next();
pushRead(read);
contig = read.getReferenceName();
position = read.getAlignmentStart() - 1;
return next();
}
else {
// at this point, window contains all reads covering the pos, we need to return them
// and the offsets into each read for this loci
calcOffsetsOfWindow(position);
return this;
}
}
private void pushRead(SAMRecord read) {
//System.out.printf(" -> Adding read %s %d-%d flags %s%n", read.getReadName(), read.getAlignmentStart(), read.getAlignmentEnd(), Utils.readFlagsAsString(read));
reads.add(read);
}
class KeepReadPFunc implements Predicate<SAMRecord> {
public boolean apply(SAMRecord read) {
return position >= read.getAlignmentStart() &&
position < read.getAlignmentEnd() &&
read.getReferenceName().equals(contig); // should be index for efficiency
}
}
Predicate KeepReadP = new LocusIterator.KeepReadPFunc();
private void calcOffsetsOfWindow(final int position) {
offsets.clear();
for ( SAMRecord read : reads ) {
// def calcOffset( read ):
// offset = self.pos - read.start
// return offset
//
// offsets = map(calcOffset, self.window)
final int offset = position - read.getAlignmentStart();
assert(offset < read.getReadLength() );
offsets.add(offset);
//System.out.printf("offsets [%d] %s%n", read.getAlignmentStart(), offsets);
}
}
private void cleanReads() {
// def keepReadP( read ):
// return read.chr == chr and pos >= read.start and pos <= read.end
// self.window = filter( keepReadP, self.window )
reads = Utils.filter(KeepReadP, reads);
}
private void expandReads() {
// for read in self.rs:
// #print 'read', read, pos
// if read.chr == chr and read.start <= pos and read.end >= pos:
// self.pushRead(read)
// else:
// self.rs.unget( read )
// #self.rs = chain( [read], self.rs )
// break
while ( it.hasNext() ) {
SAMRecord read = it.next();
if ( KeepReadP.apply( read ) ) {
pushRead(read);
}
else {
it.pushback(read);
break;
}
}
}
public void remove() { public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");

View File

@ -17,10 +17,10 @@ public interface LocusWalker<MapType, ReduceType> {
public String walkerType(); public String walkerType();
// Do we actually want to operate on the context? // Do we actually want to operate on the context?
boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context); boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context);
// Map over the org.broadinstitute.sting.atk.LocusContext // Map over the org.broadinstitute.sting.atk.LocusContext
MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context); MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context);
// Given result of map function // Given result of map function
ReduceType reduceInit(); ReduceType reduceInit();

View File

@ -383,12 +383,6 @@ public class TraversalEngine {
result = true; result = true;
why = "No alignment start"; why = "No alignment start";
} }
else if ( rec.getCigar().numCigarElements() > 1 ) {
// FIXME -- deal with indels correctly!
nSkippedIndels++;
result = true;
why = "Skipping indel: " + rec.getCigarString();
}
else { else {
result = false; result = false;
} }
@ -417,7 +411,8 @@ public class TraversalEngine {
protected <M,T> int traverseByLoci(LocusWalker<M,T> walker) { protected <M,T> int traverseByLoci(LocusWalker<M,T> walker) {
// prepare the read filtering read iterator and provide it to a new locus iterator // prepare the read filtering read iterator and provide it to a new locus iterator
FilteringIterator filterIter = new FilteringIterator(samReadIter, new locusStreamFilterFunc()); FilteringIterator filterIter = new FilteringIterator(samReadIter, new locusStreamFilterFunc());
CloseableIterator<LocusIterator> iter = new LocusIterator(filterIter); //LocusIterator iter = new SingleLocusIterator(filterIter);
LocusIterator iter = new LocusIteratorByHanger(filterIter);
// Initialize the reference ordered data iterators // Initialize the reference ordered data iterators
List<ReferenceOrderedData.RODIterator> rodIters = initializeRODs(); List<ReferenceOrderedData.RODIterator> rodIters = initializeRODs();
@ -432,7 +427,7 @@ public class TraversalEngine {
this.nRecords++; this.nRecords++;
// actually get the read and hand it to the walker // actually get the read and hand it to the walker
final LocusIterator locus = iter.next(); final LocusContext locus = iter.next();
// Poor man's version of index LOL // Poor man's version of index LOL
if ( inLocations(locus.getLocation()) ) { if ( inLocations(locus.getLocation()) ) {

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusWalker; import org.broadinstitute.sting.atk.LocusWalker;
import org.broadinstitute.sting.atk.LocusIterator; import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
@ -22,7 +23,7 @@ public abstract class BasicLociWalker<MapType, ReduceType> implements LocusWalke
public String walkerType() { return "ByLocus"; } public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context? // Do we actually want to operate on the context?
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) { public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return true; // We are keeping all the reads return true; // We are keeping all the reads
} }
@ -30,7 +31,7 @@ public abstract class BasicLociWalker<MapType, ReduceType> implements LocusWalke
} }
// These three capabilities must be overridden // These three capabilities must be overridden
public abstract MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context); public abstract MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context);
public abstract ReduceType reduceInit(); public abstract ReduceType reduceInit();
public abstract ReduceType reduce(MapType value, ReduceType sum); public abstract ReduceType reduce(MapType value, ReduceType sum);

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.atk.modules; package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusIterator; import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import java.util.List; import java.util.List;
@ -13,7 +13,7 @@ import java.util.List;
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
public class CountLociWalker extends BasicLociWalker<Integer, Integer> { public class CountLociWalker extends BasicLociWalker<Integer, Integer> {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) { public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return 1; return 1;
} }

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusIterator; import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.GenotypeEvidence; import org.broadinstitute.sting.atk.GenotypeEvidence;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
@ -10,7 +11,7 @@ import java.util.List;
import static java.lang.System.currentTimeMillis; import static java.lang.System.currentTimeMillis;
public class GenotypeWalker extends BasicLociWalker<Integer, Integer> { public class GenotypeWalker extends BasicLociWalker<Integer, Integer> {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) { public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
//char[] = new char(26); //char[] = new char(26);
long start_tm = currentTimeMillis(); long start_tm = currentTimeMillis();
List<SAMRecord> reads = context.getReads(); List<SAMRecord> reads = context.getReads();

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusWalker; import org.broadinstitute.sting.atk.LocusWalker;
import org.broadinstitute.sting.atk.LocusIterator; import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.rodDbSNP; import org.broadinstitute.sting.utils.rodDbSNP;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
@ -19,12 +20,12 @@ public class NullWalker implements LocusWalker<Integer, Integer> {
public String walkerType() { return "ByLocus"; } public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context? // Do we actually want to operate on the context?
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) { public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return true; // We are keeping all the reads return true; // We are keeping all the reads
} }
// Map over the org.broadinstitute.sting.atk.LocusContext // Map over the org.broadinstitute.sting.atk.LocusContext
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context)
{ {
return 1; return 1;
} }

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusWalker; import org.broadinstitute.sting.atk.LocusWalker;
import org.broadinstitute.sting.atk.LocusIterator; import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.rodDbSNP; import org.broadinstitute.sting.utils.rodDbSNP;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
@ -23,12 +24,12 @@ public class PileupWalker implements LocusWalker<Integer, Integer> {
public String walkerType() { return "ByLocus"; } public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context? // Do we actually want to operate on the context?
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) { public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return true; // We are keeping all the reads return true; // We are keeping all the reads
} }
// Map over the org.broadinstitute.sting.atk.LocusContext // Map over the org.broadinstitute.sting.atk.LocusContext
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) { public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
//System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size()); //System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size());
//for ( SAMRecord read : context.getReads() ) { //for ( SAMRecord read : context.getReads() ) {
// System.out.println(" -> " + read.getReadName()); // System.out.println(" -> " + read.getReadName());

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusWalker; import org.broadinstitute.sting.atk.LocusWalker;
import org.broadinstitute.sting.atk.LocusIterator; import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.rodDbSNP; import org.broadinstitute.sting.utils.rodDbSNP;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
@ -19,7 +20,7 @@ public class SingleSampleGenotyper implements LocusWalker<Integer, Integer> {
public String walkerType() { return "ByLocus"; } public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context? // Do we actually want to operate on the context?
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) { public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return true; // We are keeping all the reads return true; // We are keeping all the reads
} }
@ -86,7 +87,7 @@ public class SingleSampleGenotyper implements LocusWalker<Integer, Integer> {
} }
// Map over the org.broadinstitute.sting.atk.LocusContext // Map over the org.broadinstitute.sting.atk.LocusContext
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) { public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
//System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size()); //System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size());
//for ( SAMRecord read : context.getReads() ) { //for ( SAMRecord read : context.getReads() ) {
// System.out.println(" -> " + read.getReadName()); // System.out.println(" -> " + read.getReadName());

View File

@ -23,11 +23,19 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
// Ugly global variable defining the optional ordering of contig elements // Ugly global variable defining the optional ordering of contig elements
// //
public static HashMap<String, Integer> refContigOrdering = null; public static HashMap<String, Integer> refContigOrdering = null;
public static HashMap<String, String> interns = null;
public static void setContigOrdering(HashMap<String, Integer> rco) { public static void setContigOrdering(HashMap<String, Integer> rco) {
refContigOrdering = rco; refContigOrdering = rco;
interns = new HashMap<String, String>();
for ( String contig : rco.keySet() )
interns.put( contig, contig );
} }
public GenomeLoc( final String contig, final long start, final long stop ) { public GenomeLoc( String contig, final long start, final long stop ) {
if ( interns != null )
contig = interns.get(contig);
this.contig = contig; this.contig = contig;
this.start = start; this.start = start;
this.stop = stop; this.stop = stop;
@ -37,12 +45,16 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
this( contig, pos, pos ); this( contig, pos, pos );
} }
/**
 * Copy constructor: creates a new location with the same contig, start and stop
 * as {@code toCopy}.
 *
 * @param toCopy the location to duplicate; must not be null
 */
public GenomeLoc( final GenomeLoc toCopy ) {
    // Hand the contig reference through unchanged. Strings are immutable, so a
    // defensive new String(...) copy buys nothing, and it would defeat the
    // identity (==) fast path in compareContigs whenever no intern table is set.
    this( toCopy.getContig(), toCopy.getStart(), toCopy.getStop() );
}
// //
// Parsing string representations // Parsing string representations
// //
private static long parsePosition( final String pos ) { private static long parsePosition( final String pos ) {
String x = pos.replaceAll(",", ""); String x = pos.replaceAll(",", "");
return Long.parseLong(x); return Long.parseLong(x);
} }
public static GenomeLoc parseGenomeLoc( final String str ) { public static GenomeLoc parseGenomeLoc( final String str ) {
@ -57,7 +69,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
long start = 1; long start = 1;
long stop = Integer.MAX_VALUE; long stop = Integer.MAX_VALUE;
boolean bad = false; boolean bad = false;
Matcher match1 = regex1.matcher(str); Matcher match1 = regex1.matcher(str);
Matcher match2 = regex2.matcher(str); Matcher match2 = regex2.matcher(str);
Matcher match3 = regex3.matcher(str); Matcher match3 = regex3.matcher(str);
@ -133,7 +145,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
if ( that.start > this.stop ) return true; // that guy is past our start if ( that.start > this.stop ) return true; // that guy is past our start
return false; return false;
} }
public final boolean overlapsP(GenomeLoc that) { public final boolean overlapsP(GenomeLoc that) {
return ! disjointP( that ); return ! disjointP( that );
} }
@ -142,11 +154,41 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
return this.contig.equals(that.contig); return this.contig.equals(that.contig);
} }
/**
 * Signed difference between the start of this location and the start of {@code that}.
 *
 * @param that the location to subtract; must not be null
 * @return {@code this.start - that.start} when both locations are on the same contig,
 *         saturated to the range [-Integer.MAX_VALUE, Integer.MAX_VALUE];
 *         {@code Integer.MAX_VALUE} when the contigs differ
 */
public final int minus( final GenomeLoc that ) {
    if ( ! this.getContig().equals(that.getContig()) )
        return Integer.MAX_VALUE;

    // Compute in long and saturate rather than truncating with a bare (int) cast,
    // which could silently flip the sign of a large difference. The lower bound is
    // -Integer.MAX_VALUE (not MIN_VALUE) so that Math.abs() of the result is safe.
    final long delta = this.getStart() - that.getStart();
    return (int) Math.max( -Integer.MAX_VALUE, Math.min( Integer.MAX_VALUE, delta ) );
}
/**
 * Absolute value of {@link #minus(GenomeLoc)}.
 *
 * @param that the location to measure against; must not be null
 * @return the non-negative magnitude of {@code minus(that)}; this is
 *         {@code Integer.MAX_VALUE} when {@code minus} returns its
 *         different-contig sentinel
 */
public final int distance( final GenomeLoc that ) {
    final int delta = minus(that);
    // Math.abs(Integer.MIN_VALUE) is still negative; map that case to the largest
    // representable distance so the result is never below zero.
    return delta == Integer.MIN_VALUE ? Integer.MAX_VALUE : Math.abs(delta);
}
/**
 * Tests whether this location lies in the closed interval {@code [left, right]}
 * according to {@code compareTo}.
 *
 * @param left  lower bound (inclusive)
 * @param right upper bound (inclusive)
 * @return true when this location compares at or after {@code left} and at or
 *         before {@code right}
 */
public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) {
    if ( this.compareTo(left) < 0 )
        return false;   // strictly before the lower bound
    return this.compareTo(right) <= 0;
}
/**
 * Advances this location by a single position (both start and stop), in place.
 */
public final void incPos() {
    this.incPos(1L);
}
/**
 * Shifts this location in place by adding {@code by} to both its start and stop.
 *
 * @param by the offset added to each coordinate
 */
public final void incPos(long by) {
    // The two updates are independent; both ends move by the same amount.
    this.stop += by;
    this.start += by;
}
/**
 * Builds a new location one position past this one, leaving this object untouched.
 *
 * @return a copy of this location advanced by one via {@code incPos()}
 */
public final GenomeLoc nextLoc() {
    final GenomeLoc successor = new GenomeLoc(this);
    successor.incPos();
    return successor;
}
// //
// Comparison operations // Comparison operations
// //
public static int compareContigs( final String thisContig, final String thatContig ) { public static int compareContigs( final String thisContig, final String thatContig ) {
if ( thisContig == thatContig )
return 0;
if ( refContigOrdering != null ) { if ( refContigOrdering != null ) {
if ( ! refContigOrdering.containsKey(thisContig) ) { if ( ! refContigOrdering.containsKey(thisContig) ) {
if ( ! refContigOrdering.containsKey(thatContig) ) { if ( ! refContigOrdering.containsKey(thatContig) ) {
@ -192,4 +234,4 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
if ( this.getStop() > that.getStop() ) return 1; if ( this.getStop() > that.getStop() ) return 1;
return 0; return 0;
} }
} }

View File

@ -68,8 +68,15 @@ public class Utils {
return ret.toString(); return ret.toString();
} }
public static String join(String separator, Collection<String> strings) { //public static String join(String separator, Collection<String> strings) {
return join( separator, strings.toArray(new String[0]) ); // return join( separator, strings.toArray(new String[0]) );
//}
public static <T> String join(String separator, Collection<T> objects) {
ArrayList<String> strs = new ArrayList<String>();
for ( Object x : objects )
strs.add(x.toString());
return join( separator, strs.toArray(new String[0]) );
} }
public static double average(List<Long> vals, int maxI) { public static double average(List<Long> vals, int maxI) {
@ -97,14 +104,16 @@ public class Utils {
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary(); List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary();
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>(); HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
int i = 0; if ( refContigs != null ) {
System.out.printf("Prepared reference sequence contig dictionary%n order ->"); int i = 0;
for ( SAMSequenceRecord contig : refContigs ) { System.out.printf("Prepared reference sequence contig dictionary%n order ->");
System.out.printf(" %s", contig.getSequenceName()); for ( SAMSequenceRecord contig : refContigs ) {
refContigOrdering.put(contig.getSequenceName(), i); System.out.printf(" %s", contig.getSequenceName());
i++; refContigOrdering.put(contig.getSequenceName(), i);
i++;
}
System.out.printf("%n Total elements -> %d%n", refContigOrdering.size());
} }
System.out.printf("%n Total elements -> %d%n", refContigOrdering.size());
GenomeLoc.setContigOrdering(refContigOrdering); GenomeLoc.setContigOrdering(refContigOrdering);
} }