Added support for HangingLocusIterator

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@42 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-03-12 23:30:19 +00:00
parent 8a63606e11
commit 04befb942e
13 changed files with 103 additions and 161 deletions

View File

@ -40,6 +40,7 @@ public class AnalysisTK extends CommandLineProgram {
addModule("Genotype", new GenotypeWalker());
addModule("SingleSampleGenotyper", new SingleSampleGenotyper());
addModule("Null", new NullWalker());
addModule("DepthOfCoverage", new DepthOfCoverageWalker());
}
private TraversalEngine engine = null;

View File

@ -4,6 +4,8 @@ import net.sf.samtools.SAMRecord;
import java.util.List;
import org.broadinstitute.sting.utils.GenomeLoc;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
@ -11,18 +13,17 @@ import java.util.List;
* Time: 3:01:34 PM
* To change this template use File | Settings | File Templates.
*/
public class LocusContext {
public LocusContext() { };
// How big is the current context?
public int getLength() { return 1; }
// get the reference base at the current (relative) position
public byte getReferenceBase() { return 0; }
public interface LocusContext {
// get all of the reads within this context
public List<SAMRecord> getReads() { return null; }
public List<SAMRecord> getReads();
// get a list of the equivalent positions within the reads at Pos
public List<Integer> getOffsets() { return null; }
public List<Integer> getOffsets();
public String getContig();
public long getPosition();
public GenomeLoc getLocation();
public int numReads();
}

View File

@ -14,37 +14,13 @@ import java.util.Iterator;
/**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis
*/
public class LocusIterator implements Iterable<LocusIterator>, CloseableIterator<LocusIterator> {
// -----------------------------------------------------------------------------------------------------------------
//
// member fields
//
// -----------------------------------------------------------------------------------------------------------------
private final PushbackIterator<SAMRecord> it;
private String contig = null;
private int position = -1;
private List<SAMRecord> reads = new ArrayList<SAMRecord>(100);
private List<Integer> offsets = new ArrayList<Integer>(100);
protected String getContig() { return contig; }
protected long getPosition() { return position; }
public GenomeLoc getLocation() { return new GenomeLoc(contig, position); }
public List<SAMRecord> getReads() { return reads; }
public List<Integer> getOffsets() { return offsets; }
public int numReads() { return reads.size(); }
public abstract class LocusIterator implements Iterable<LocusContext>, CloseableIterator<LocusContext> {
// -----------------------------------------------------------------------------------------------------------------
//
// constructors and other basic operations
//
// -----------------------------------------------------------------------------------------------------------------
public LocusIterator(final CloseableIterator<SAMRecord> samIterator) {
this.it = new PushbackIterator<SAMRecord>(samIterator);
}
public Iterator<LocusIterator> iterator() {
public Iterator<LocusContext> iterator() {
return this;
}
@ -52,95 +28,8 @@ public class LocusIterator implements Iterable<LocusIterator>, CloseableIterator
//this.it.close();
}
/**
 * Reports whether the wrapped pushback iterator over SAM records can still supply a read.
 *
 * @return whatever {@code it.hasNext()} answers
 */
public boolean hasNext() {
    final boolean moreReads = it.hasNext();
    return moreReads;
}
// -----------------------------------------------------------------------------------------------------------------
//
// next() routine and associated collection operations
//
// -----------------------------------------------------------------------------------------------------------------
/**
 * Advances the iterator by one position and returns this iterator as the view of that locus.
 *
 * Each pass steps {@code position} forward, refreshes the read window, and, if the window
 * is not empty, computes the per-read offsets and returns. When the window is empty the
 * next read is taken from the stream and the cursor is parked one base before its
 * alignment start, so that the following pass lands exactly on that start.
 *
 * @return this iterator, whose reads and offsets describe the current position
 */
public LocusIterator next() {
    while ( true ) {
        position += 1;

        if ( position != -1 ) {
            cleanReads();
            expandReads();
        }

        if ( ! reads.isEmpty() ) {
            // the window now holds the reads covering the position: compute the
            // offset into each read for this locus and hand the window back
            calcOffsetsOfWindow(position);
            return this;
        }

        // empty window: jump to the first position of the next read in the stream
        final SAMRecord firstRead = it.next();
        pushRead(firstRead);
        contig = firstRead.getReferenceName();
        position = firstRead.getAlignmentStart() - 1;
    }
}
/**
 * Appends a read to the end of the current window of reads.
 *
 * @param read the record to add to {@code reads}
 */
private void pushRead(SAMRecord read) {
    this.reads.add(read);
}
/**
 * Predicate deciding whether a read covers the iterator's current locus.
 *
 * It reads the enclosing iterator's {@code position} and {@code contig} fields, so it
 * has to remain a (non-static) inner class.
 */
class KeepReadPFunc implements Predicate<SAMRecord> {
    /**
     * @param read the record to test
     * @return true when the current position lies within the read's alignment span
     *         (both ends included) and the read is on the current contig
     */
    public boolean apply(SAMRecord read) {
        // The alignment end is an inclusive coordinate, so the last aligned base must be
        // accepted as well (pos <= read.end); a strict '<' silently drops it.
        return position >= read.getAlignmentStart() &&
               position <= read.getAlignmentEnd() &&
               read.getReferenceName().equals(contig); // should be index for efficiency
    }
}
Predicate KeepReadP = new LocusIterator.KeepReadPFunc();
/**
 * Rebuilds {@code offsets} so that it holds, for every read in {@code reads} and in the
 * same order, the distance from that read's alignment start to the given position.
 *
 * @param position the locus the offsets are computed for
 */
private void calcOffsetsOfWindow(final int position) {
    offsets.clear();
    for ( final SAMRecord windowRead : reads ) {
        final int readStart = windowRead.getAlignmentStart();
        final int offsetInRead = position - readStart;
        // sanity check (only active with -ea): the offset must fall inside the read
        assert offsetInRead < windowRead.getReadLength();
        offsets.add(offsetInRead);
    }
}
/**
 * Replaces the window of reads with the result of running it through
 * {@code Utils.filter} with the {@code KeepReadP} predicate.
 */
private void cleanReads() {
    // NOTE(review): presumably filter() keeps only the reads the predicate accepts,
    // i.e. those still covering the current position - confirm against Utils.filter
    final List<SAMRecord> filtered = Utils.filter(KeepReadP, reads);
    reads = filtered;
}
/**
 * Pulls reads off the stream and adds them to the window for as long as they satisfy
 * {@code KeepReadP}. The first read that fails the test is pushed back onto the stream
 * and the scan stops; the scan also stops when the stream runs dry.
 */
private void expandReads() {
    while ( it.hasNext() ) {
        final SAMRecord candidate = it.next();
        if ( ! KeepReadP.apply( candidate ) ) {
            // not ours yet: return it to the stream for a later position
            it.pushback(candidate);
            return;
        }
        pushRead(candidate);
    }
}
public abstract boolean hasNext();
public abstract LocusContext next();
public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");

View File

@ -17,10 +17,10 @@ public interface LocusWalker<MapType, ReduceType> {
public String walkerType();
// Do we actually want to operate on the context?
boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context);
boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context);
// Map over the org.broadinstitute.sting.atk.LocusContext
MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context);
MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context);
// Given result of map function
ReduceType reduceInit();

View File

@ -383,12 +383,6 @@ public class TraversalEngine {
result = true;
why = "No alignment start";
}
else if ( rec.getCigar().numCigarElements() > 1 ) {
// FIXME -- deal with indels correctly!
nSkippedIndels++;
result = true;
why = "Skipping indel: " + rec.getCigarString();
}
else {
result = false;
}
@ -417,7 +411,8 @@ public class TraversalEngine {
protected <M,T> int traverseByLoci(LocusWalker<M,T> walker) {
// prepare the read filtering read iterator and provide it to a new locus iterator
FilteringIterator filterIter = new FilteringIterator(samReadIter, new locusStreamFilterFunc());
CloseableIterator<LocusIterator> iter = new LocusIterator(filterIter);
//LocusIterator iter = new SingleLocusIterator(filterIter);
LocusIterator iter = new LocusIteratorByHanger(filterIter);
// Initialize the reference ordered data iterators
List<ReferenceOrderedData.RODIterator> rodIters = initializeRODs();
@ -432,7 +427,7 @@ public class TraversalEngine {
this.nRecords++;
// actually get the read and hand it to the walker
final LocusIterator locus = iter.next();
final LocusContext locus = iter.next();
// Poor man's version of index LOL
if ( inLocations(locus.getLocation()) ) {

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusWalker;
import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import net.sf.samtools.SAMRecord;
@ -22,7 +23,7 @@ public abstract class BasicLociWalker<MapType, ReduceType> implements LocusWalke
public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context?
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return true; // We are keeping all the reads
}
@ -30,7 +31,7 @@ public abstract class BasicLociWalker<MapType, ReduceType> implements LocusWalke
}
// These three capabilities must be overridden
public abstract MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context);
public abstract MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context);
public abstract ReduceType reduceInit();
public abstract ReduceType reduce(MapType value, ReduceType sum);

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import java.util.List;
@ -13,7 +13,7 @@ import java.util.List;
* To change this template use File | Settings | File Templates.
*/
public class CountLociWalker extends BasicLociWalker<Integer, Integer> {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return 1;
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.GenotypeEvidence;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import net.sf.samtools.SAMRecord;
@ -10,7 +11,7 @@ import java.util.List;
import static java.lang.System.currentTimeMillis;
public class GenotypeWalker extends BasicLociWalker<Integer, Integer> {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
//char[] = new char(26);
long start_tm = currentTimeMillis();
List<SAMRecord> reads = context.getReads();

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusWalker;
import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.rodDbSNP;
import org.broadinstitute.sting.utils.Utils;
@ -19,12 +20,12 @@ public class NullWalker implements LocusWalker<Integer, Integer> {
public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context?
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return true; // We are keeping all the reads
}
// Map over the org.broadinstitute.sting.atk.LocusContext
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context)
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context)
{
return 1;
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusWalker;
import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.rodDbSNP;
import org.broadinstitute.sting.utils.Utils;
@ -23,12 +24,12 @@ public class PileupWalker implements LocusWalker<Integer, Integer> {
public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context?
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return true; // We are keeping all the reads
}
// Map over the org.broadinstitute.sting.atk.LocusContext
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
//System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size());
//for ( SAMRecord read : context.getReads() ) {
// System.out.println(" -> " + read.getReadName());

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusWalker;
import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.rodDbSNP;
import org.broadinstitute.sting.utils.Utils;
@ -19,7 +20,7 @@ public class SingleSampleGenotyper implements LocusWalker<Integer, Integer> {
public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context?
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
return true; // We are keeping all the reads
}
@ -86,7 +87,7 @@ public class SingleSampleGenotyper implements LocusWalker<Integer, Integer> {
}
// Map over the org.broadinstitute.sting.atk.LocusContext
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
//System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size());
//for ( SAMRecord read : context.getReads() ) {
// System.out.println(" -> " + read.getReadName());

View File

@ -23,11 +23,19 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
// Ugly global variable defining the optional ordering of contig elements
//
public static HashMap<String, Integer> refContigOrdering = null;
public static HashMap<String, String> interns = null;
/**
 * Installs the global contig ordering and rebuilds the table of canonical contig-name
 * strings from the keys of that ordering (each name maps to itself).
 *
 * @param rco map from contig name to an Integer; stored as-is in {@code refContigOrdering}
 */
public static void setContigOrdering(HashMap<String, Integer> rco) {
    refContigOrdering = rco;

    final HashMap<String, String> canonicalNames = new HashMap<String, String>();
    interns = canonicalNames;
    for ( final String name : rco.keySet() ) {
        canonicalNames.put( name, name );
    }
}
public GenomeLoc( final String contig, final long start, final long stop ) {
public GenomeLoc( String contig, final long start, final long stop ) {
if ( interns != null )
contig = interns.get(contig);
this.contig = contig;
this.start = start;
this.stop = stop;
@ -37,12 +45,16 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
this( contig, pos, pos );
}
/**
 * Copy constructor: creates a new location with the contig, start and stop of toCopy.
 *
 * The contig name is handed to the three-argument constructor as a fresh String copy.
 *
 * @param toCopy the location to duplicate
 */
public GenomeLoc( final GenomeLoc toCopy ) {
this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() );
}
//
// Parsing string representations
//
private static long parsePosition( final String pos ) {
String x = pos.replaceAll(",", "");
return Long.parseLong(x);
return Long.parseLong(x);
}
public static GenomeLoc parseGenomeLoc( final String str ) {
@ -57,7 +69,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
long start = 1;
long stop = Integer.MAX_VALUE;
boolean bad = false;
Matcher match1 = regex1.matcher(str);
Matcher match2 = regex2.matcher(str);
Matcher match3 = regex3.matcher(str);
@ -133,7 +145,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
if ( that.start > this.stop ) return true; // that guy is past our start
return false;
}
/**
 * Tells whether this location overlaps another one; defined as the negation of
 * {@code disjointP}.
 *
 * @param that the location to compare against
 * @return true exactly when {@code disjointP(that)} is false
 */
public final boolean overlapsP(GenomeLoc that) {
    final boolean disjoint = disjointP( that );
    return ! disjoint;
}
@ -142,11 +154,41 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
return this.contig.equals(that.contig);
}
/**
 * Signed difference between the start of this location and the start of another.
 *
 * @param that the location to subtract
 * @return {@code this.start - that.start} when both locations are on the same contig,
 *         saturated to the range [-Integer.MAX_VALUE, Integer.MAX_VALUE] instead of
 *         wrapping around when the difference does not fit in an int;
 *         {@code Integer.MAX_VALUE} when the contigs differ
 */
public final int minus( final GenomeLoc that ) {
    if ( ! this.getContig().equals(that.getContig()) )
        return Integer.MAX_VALUE;

    final long delta = this.getStart() - that.getStart();
    // A plain (int) cast would silently wrap for large differences; clamp instead.
    // The lower bound is -MAX_VALUE (not MIN_VALUE) so the result can always be negated.
    if ( delta >= Integer.MAX_VALUE )
        return Integer.MAX_VALUE;
    if ( delta <= -Integer.MAX_VALUE )
        return -Integer.MAX_VALUE;
    return (int) delta;
}
/**
 * Absolute distance between the start of this location and the start of another.
 *
 * @param that the other location
 * @return the absolute value of {@code minus(that)}; never negative
 */
public final int distance( final GenomeLoc that ) {
    final int delta = minus(that);
    // Math.abs(Integer.MIN_VALUE) is still negative, so map that one value explicitly.
    return delta == Integer.MIN_VALUE ? Integer.MAX_VALUE : Math.abs(delta);
}
/**
 * Tests whether this location lies within the closed interval [left, right] under
 * the ordering defined by {@code compareTo}.
 *
 * @param left  lower bound (inclusive)
 * @param right upper bound (inclusive)
 * @return true when this compares at or after left and at or before right
 */
public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) {
    if ( this.compareTo(left) < 0 )
        return false;
    return this.compareTo(right) <= 0;
}
/**
 * Shifts this location forward by one position (modifies this object in place).
 */
public final void incPos() {
    incPos(1);
}

/**
 * Shifts this location by the given amount, moving start and stop together
 * (modifies this object in place).
 *
 * @param by the amount added to both start and stop
 */
public final void incPos(long by) {
    this.stop += by;
    this.start += by;
}
/**
 * Produces the location one position after this one, leaving this object untouched.
 *
 * @return a copy of this location with start and stop both advanced by one
 */
public final GenomeLoc nextLoc() {
    final GenomeLoc successor = new GenomeLoc(this);
    successor.incPos(1);
    return successor;
}
//
// Comparison operations
//
public static int compareContigs( final String thisContig, final String thatContig ) {
if ( thisContig == thatContig )
return 0;
if ( refContigOrdering != null ) {
if ( ! refContigOrdering.containsKey(thisContig) ) {
if ( ! refContigOrdering.containsKey(thatContig) ) {
@ -192,4 +234,4 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
if ( this.getStop() > that.getStop() ) return 1;
return 0;
}
}
}

View File

@ -68,8 +68,15 @@ public class Utils {
return ret.toString();
}
public static String join(String separator, Collection<String> strings) {
return join( separator, strings.toArray(new String[0]) );
//public static String join(String separator, Collection<String> strings) {
// return join( separator, strings.toArray(new String[0]) );
//}
/**
 * Joins the string forms of arbitrary objects with a separator.
 *
 * Each element is converted with {@code String.valueOf}, so a null element contributes
 * the text "null" instead of raising a NullPointerException. The actual joining is
 * delegated to the {@code join(String, String[])} overload.
 *
 * @param separator text placed between consecutive elements
 * @param objects   the elements to join, in iteration order
 * @return the joined string
 */
public static <T> String join(String separator, Collection<T> objects) {
    // presize: exactly one string per input element
    final ArrayList<String> strs = new ArrayList<String>(objects.size());
    for ( final T x : objects )
        strs.add(String.valueOf(x));
    return join( separator, strs.toArray(new String[0]) );
}
public static double average(List<Long> vals, int maxI) {
@ -97,14 +104,16 @@ public class Utils {
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary();
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
int i = 0;
System.out.printf("Prepared reference sequence contig dictionary%n order ->");
for ( SAMSequenceRecord contig : refContigs ) {
System.out.printf(" %s", contig.getSequenceName());
refContigOrdering.put(contig.getSequenceName(), i);
i++;
if ( refContigs != null ) {
int i = 0;
System.out.printf("Prepared reference sequence contig dictionary%n order ->");
for ( SAMSequenceRecord contig : refContigs ) {
System.out.printf(" %s", contig.getSequenceName());
refContigOrdering.put(contig.getSequenceName(), i);
i++;
}
System.out.printf("%n Total elements -> %d%n", refContigOrdering.size());
}
System.out.printf("%n Total elements -> %d%n", refContigOrdering.size());
GenomeLoc.setContigOrdering(refContigOrdering);
}