diff --git a/java/jars/functionalj.jar b/java/jars/functionalj.jar new file mode 100644 index 000000000..5d41cbdb7 Binary files /dev/null and b/java/jars/functionalj.jar differ diff --git a/java/lib/edu/mit/broad/picard/util/SequenceUtil.java b/java/lib/edu/mit/broad/picard/util/SequenceUtil.java index d0a7937f8..e847611c6 100644 --- a/java/lib/edu/mit/broad/picard/util/SequenceUtil.java +++ b/java/lib/edu/mit/broad/picard/util/SequenceUtil.java @@ -41,6 +41,20 @@ public class SequenceUtil { return new String(complement); } + public static char complementBase(char base) { + switch ( base ) { + case 'a' : return 't'; + case 'A' : return 'T'; + case 'c' : return 'g'; + case 'C' : return 'G'; + case 'g' : return 'c'; + case 'G' : return 'C'; + case 't' : return 'a'; + case 'T' : return 'A'; + default : return base; + } + } + /** Attempts to efficiently compare two bases stored as bytes for equality. */ public static boolean basesEqual(byte lhs, byte rhs) { if (lhs == rhs) return true; diff --git a/java/src/edu/mit/broad/sting/atk/AnalysisTK.java b/java/src/edu/mit/broad/sting/atk/AnalysisTK.java index 0d7762800..4d5789515 100644 --- a/java/src/edu/mit/broad/sting/atk/AnalysisTK.java +++ b/java/src/edu/mit/broad/sting/atk/AnalysisTK.java @@ -35,6 +35,7 @@ public class AnalysisTK extends CommandLineProgram { addModule("CountLoci", new CountLociWalker()); addModule("Pileup", new PileupWalker()); addModule("CountReads", new CountReadsWalker()); + addModule("PrintReads", new PrintReadsWalker()); addModule("Base_Quality_Histogram", new BaseQualityHistoWalker()); } @@ -69,6 +70,7 @@ public class AnalysisTK extends CommandLineProgram { } this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods); + engine.initialize(); ValidationStringency strictness; if ( STRICTNESS_ARG == null ) { @@ -89,8 +91,11 @@ public class AnalysisTK extends CommandLineProgram { engine.setDebugging(! ( DEBUGGING_STR == null || DEBUGGING_STR.toLowerCase().equals("true"))); engine.setMaxReads(Integer.parseInt(MAX_READS_ARG)); + if ( REGION_STR != null ) { + engine.setLocation(REGION_STR); + } + //LocusWalker walker = new PileupWalker(); - engine.initialize(); try { LocusWalker walker = (LocusWalker)MODULES.get(Analysis_Name); engine.traverseByLoci(walker); diff --git a/java/src/edu/mit/broad/sting/atk/LocusIterator.java b/java/src/edu/mit/broad/sting/atk/LocusIterator.java index 83c93b8ed..2f6fa516e 100755 --- a/java/src/edu/mit/broad/sting/atk/LocusIterator.java +++ b/java/src/edu/mit/broad/sting/atk/LocusIterator.java @@ -5,6 +5,7 @@ import edu.mit.broad.sam.SAMRecord; import edu.mit.broad.sting.utils.PushbackIterator; import edu.mit.broad.sting.utils.Utils; import edu.mit.broad.sting.utils.Predicate; +import edu.mit.broad.sting.utils.GenomeLoc; import java.util.List; import java.util.ArrayList; @@ -26,8 +27,10 @@ public class LocusIterator implements Iterable, CloseableIterator private List reads = new ArrayList(100); private List offsets = new ArrayList(100); - public String getContig() { return contig; } - public int getPosition() { return position; } + protected String getContig() { return contig; } + protected long getPosition() { return position; } + public GenomeLoc getLocation() { return new GenomeLoc(contig, position); } + public List getReads() { return reads; } public List getOffsets() { return offsets; } diff --git a/java/src/edu/mit/broad/sting/atk/PrepareROD.java b/java/src/edu/mit/broad/sting/atk/PrepareROD.java index 991620eab..74c677d04 100644 --- a/java/src/edu/mit/broad/sting/atk/PrepareROD.java +++ b/java/src/edu/mit/broad/sting/atk/PrepareROD.java @@ -55,7 +55,7 @@ public class PrepareROD extends CommandLineProgram { refContigOrdering.put(contig.getSequenceName(), i); i++; } - ReferenceOrderedDatum.setContigOrdering(refContigOrdering); + GenomeLoc.setContigOrdering(refContigOrdering); Class rodClass = Types.get(ROD_TYPE.toLowerCase()); diff --git a/java/src/edu/mit/broad/sting/atk/TraversalEngine.java b/java/src/edu/mit/broad/sting/atk/TraversalEngine.java index 68f17c91a..794097300 100755 --- a/java/src/edu/mit/broad/sting/atk/TraversalEngine.java +++ b/java/src/edu/mit/broad/sting/atk/TraversalEngine.java @@ -8,16 +8,17 @@ import edu.mit.broad.picard.filter.SamRecordFilter; import edu.mit.broad.picard.filter.FilteringIterator; import edu.mit.broad.picard.reference.ReferenceSequenceFile; import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; -import edu.mit.broad.sting.utils.ReferenceIterator; -import edu.mit.broad.sting.utils.ReferenceOrderedData; -import edu.mit.broad.sting.utils.ReferenceOrderedDatum; -import edu.mit.broad.sting.utils.Utils; +import edu.mit.broad.sting.utils.*; import java.io.*; -import java.util.List; -import java.util.Iterator; -import java.util.ArrayList; -import java.util.Arrays; +import java.util.*; + +import net.sf.functionalj.reflect.StdReflect; +import net.sf.functionalj.reflect.JdkStdReflect; +import net.sf.functionalj.FunctionN; +import net.sf.functionalj.Function1; +import net.sf.functionalj.Functions; +import net.sf.functionalj.util.Operators; public class TraversalEngine { // Usage and parameters @@ -46,6 +47,8 @@ public class TraversalEngine { public boolean DEBUGGING = false; + private GenomeLoc[] locs = null; + // -------------------------------------------------------------------------------------------------------------- // // Setting up the engine @@ -63,6 +66,54 @@ public class TraversalEngine { public void setMaxReads( final int maxReads ) { this.maxReads = maxReads; } public void setDebugging( final boolean d ) { DEBUGGING = d; } + + // -------------------------------------------------------------------------------------------------------------- + // + // functions for dealing locations (areas of the genome we're traversing over) + // + // -------------------------------------------------------------------------------------------------------------- + public void setLocation( final String locStr ) { + this.locs = parseGenomeLocs(locStr); + } + + public static GenomeLoc[] parseGenomeLocs( final String str ) { + // Of the form: loc1;loc2;... + // Where each locN can be: + // Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + StdReflect reflect = new JdkStdReflect(); + FunctionN parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class); + Function1 f1 = parseOne.f1(); + Collection result = Functions.map(f1, Arrays.asList(str.split(";"))); + GenomeLoc[] locs = (GenomeLoc[])result.toArray(new GenomeLoc[0]); + + Arrays.sort(locs); + for ( GenomeLoc l : locs ) + System.out.printf(" -> %s%n", l); + + System.out.printf(" Locations are: %s%n", Utils.join(" ", Functions.map( Operators.toString, Arrays.asList(locs) ) ) ); + + return locs; + } + + public boolean inLocations( GenomeLoc curr ) { + if ( this.locs == null ) + return true; + else { + for ( GenomeLoc loc : this.locs ) { + //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); + if ( loc.overlapsP(curr) ) + return true; + } + return false; + } + } + + public boolean pastFinalLocation( GenomeLoc curr ) { + boolean r = locs != null && locs[locs.length-1].compareTo( curr ) == -1 && ! locs[locs.length-1].overlapsP(curr); + //System.out.printf(" pastFinalLocation %s vs. %s => %d => %b%n", locs[locs.length-1], curr, locs[locs.length-1].compareTo( curr ), r); + return r; + } + // -------------------------------------------------------------------------------------------------------------- // // functions for dealing with the reference sequence @@ -127,17 +178,18 @@ public class TraversalEngine { } protected List getReferenceOrderedDataAtLocus(List rodIters, - final String contig, final int pos) { + final GenomeLoc loc) { List data = new ArrayList(); for ( ReferenceOrderedData.RODIterator iter : rodIters ) { - data.add(iter.seekForward(contig, pos)); + data.add(iter.seekForward(loc)); } return data; } + // -------------------------------------------------------------------------------------------------------------- // - // traversal functions + // traversal by loci functions // // -------------------------------------------------------------------------------------------------------------- protected int initialize() { @@ -197,30 +249,38 @@ public class TraversalEngine { List rodIters = initializeRODs(); T sum = walker.reduceInit(); - while ( iter.hasNext() ) { + boolean done = false; + while ( iter.hasNext() && ! done ) { this.nRecords++; // actually get the read and hand it to the walker final LocusIterator locus = iter.next(); - final ReferenceIterator refSite = refIter.seekForward(locus.getContig(), locus.getPosition()); - final char refBase = refSite.getBaseAsChar(); - final List rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getContig(), locus.getPosition()); - if ( DEBUGGING ) - System.out.printf(" Reference: %s:%d %c%n", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase); + // Poor man's version of index LOL + if ( inLocations(locus.getLocation()) ) { + final ReferenceIterator refSite = refIter.seekForward(locus.getContig(), locus.getPosition()); + final char refBase = refSite.getBaseAsChar(); + final List rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getLocation()); - final boolean keepMeP = walker.filter(rodData, refBase, locus); - if ( keepMeP ) { - M x = walker.map(rodData, refBase, locus); - sum = walker.reduce(x, sum); + if ( DEBUGGING ) + System.out.printf(" Reference: %s:%d %c%n", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase); + + final boolean keepMeP = walker.filter(rodData, refBase, locus); + if ( keepMeP ) { + M x = walker.map(rodData, refBase, locus); + sum = walker.reduce(x, sum); + } + + if ( this.maxReads > 0 && this.nRecords > this.maxReads ) { + System.out.println("Maximum number of reads encountered, terminating traversal " + this.nRecords); + done = true; + } + + printProgress("loci"); } - if ( this.maxReads > 0 && this.nRecords > this.maxReads ) { - System.out.println("Maximum number of reads encountered, terminating traversal " + this.nRecords); - break; - } - - printProgress("loci"); + if ( pastFinalLocation(locus.getLocation()) ) + done = true; } printProgress( true, "loci" ); @@ -234,28 +294,43 @@ public class TraversalEngine { return 0; } + // -------------------------------------------------------------------------------------------------------------- + // + // traversal by read functions + // + // -------------------------------------------------------------------------------------------------------------- protected int traverseByRead(ReadWalker walker) { walker.initialize(); CloseableIterator iter = readStream.iterator(); R sum = walker.reduceInit(); - while ( iter.hasNext() ) { + boolean done = false; + while ( iter.hasNext() && ! done ) { this.nRecords++; // actually get the read and hand it to the walker final SAMRecord read = iter.next(); - final boolean keepMeP = walker.filter(null, read); - if ( keepMeP ) { - M x = walker.map(null, read); - sum = walker.reduce(x, sum); - } + GenomeLoc loc = new GenomeLoc(read.getReferenceName(), read.getAlignmentStart()); - if ( this.maxReads > 0 && this.nRecords > this.maxReads ) { - System.out.println("Maximum number of reads encountered, terminating traversal " + this.nRecords); - break; - } + if ( inLocations(loc) ) { + final boolean keepMeP = walker.filter(null, read); - printProgress("reads"); - } + if ( keepMeP ) { + M x = walker.map(null, read); + sum = walker.reduce(x, sum); + } + + if ( this.maxReads > 0 && this.nRecords > this.maxReads ) { + System.out.println("Maximum number of reads encountered, terminating traversal " + this.nRecords); + break; + } + + printProgress("reads"); + } + + if ( pastFinalLocation(loc) ) + done = true; + //System.out.printf("Done? %b%n", done); + } printProgress( true, "reads" ); System.out.println("Traversal reduce result is " + sum); diff --git a/java/src/edu/mit/broad/sting/atk/modules/BasicLociWalker.java b/java/src/edu/mit/broad/sting/atk/modules/BasicLociWalker.java new file mode 100755 index 000000000..daf38d3dc --- /dev/null +++ b/java/src/edu/mit/broad/sting/atk/modules/BasicLociWalker.java @@ -0,0 +1,37 @@ +package edu.mit.broad.sting.atk.modules; + +import edu.mit.broad.sting.atk.LocusWalker; +import edu.mit.broad.sting.atk.LocusIterator; +import edu.mit.broad.sting.utils.ReferenceOrderedDatum; +import edu.mit.broad.sam.SAMRecord; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 3:22:14 PM + * To change this template use File | Settings | File Templates. + */ +public abstract class BasicLociWalker implements LocusWalker { + public void initialize() { + ; + } + + public String walkerType() { return "ByLocus"; } + + // Do we actually want to operate on the context? + public boolean filter(List rodData, char ref, LocusIterator context) { + return true; // We are keeping all the reads + } + + public void onTraveralDone() { + } + + // These three capabilities must be overidden + public abstract MapType map(List rodData, char ref, LocusIterator context); + public abstract ReduceType reduceInit(); + public abstract ReduceType reduce(MapType value, ReduceType sum); + +} \ No newline at end of file diff --git a/java/src/edu/mit/broad/sting/atk/modules/BasicReadWalker.java b/java/src/edu/mit/broad/sting/atk/modules/BasicReadWalker.java new file mode 100755 index 000000000..6606b190a --- /dev/null +++ b/java/src/edu/mit/broad/sting/atk/modules/BasicReadWalker.java @@ -0,0 +1,31 @@ +package edu.mit.broad.sting.atk.modules; + +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sting.atk.LocusContext; +import edu.mit.broad.sting.atk.ReadWalker; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +public abstract class BasicReadWalker implements ReadWalker { + public void initialize() { } + public String walkerType() { return "ByRead"; } + + public boolean filter(LocusContext context, SAMRecord read) { + // We are keeping all the reads + return true; + } + + public void onTraveralDone() { + + } + + // Three basic abstract function that *must* be overridden + public abstract MapType map(LocusContext context, SAMRecord read); + public abstract ReduceType reduceInit(); + public abstract ReduceType reduce(MapType value, ReduceType sum); +} \ No newline at end of file diff --git a/java/src/edu/mit/broad/sting/atk/modules/CountLociWalker.java b/java/src/edu/mit/broad/sting/atk/modules/CountLociWalker.java index 52cb0dff7..98ee2d01f 100755 --- a/java/src/edu/mit/broad/sting/atk/modules/CountLociWalker.java +++ b/java/src/edu/mit/broad/sting/atk/modules/CountLociWalker.java @@ -1,9 +1,7 @@ package edu.mit.broad.sting.atk.modules; -import edu.mit.broad.sting.atk.LocusWalker; import edu.mit.broad.sting.atk.LocusIterator; import edu.mit.broad.sting.utils.ReferenceOrderedDatum; -import edu.mit.broad.sam.SAMRecord; import java.util.List; @@ -14,28 +12,14 @@ import java.util.List; * Time: 3:22:14 PM * To change this template use File | Settings | File Templates. */ -public class CountLociWalker implements LocusWalker { - public void initialize() { - } - - public String walkerType() { return "ByLocus"; } - - // Do we actually want to operate on the context? - public boolean filter(List rodData, char ref, LocusIterator context) { - return true; // We are keeping all the reads - } - - // Map over the edu.mit.broad.sting.atk.LocusContext +public class CountLociWalker extends BasicLociWalker { public Integer map(List rodData, char ref, LocusIterator context) { return 1; } - // Given result of map function public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } - - public void onTraveralDone() { - } } \ No newline at end of file diff --git a/java/src/edu/mit/broad/sting/atk/modules/CountReadsWalker.java b/java/src/edu/mit/broad/sting/atk/modules/CountReadsWalker.java index 7da3e72e2..1ac0b38cf 100755 --- a/java/src/edu/mit/broad/sting/atk/modules/CountReadsWalker.java +++ b/java/src/edu/mit/broad/sting/atk/modules/CountReadsWalker.java @@ -1,38 +1,16 @@ package edu.mit.broad.sting.atk.modules; import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sting.atk.ReadWalker; import edu.mit.broad.sting.atk.LocusContext; -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 3:22:14 PM - * To change this template use File | Settings | File Templates. - */ -public class CountReadsWalker implements ReadWalker { - - public void initialize() { } - - public String walkerType() { return "ByRead"; } - - // Do we actually want to operate on the context? - public boolean filter(LocusContext context, SAMRecord read) { - return true; // We are keeping all the reads - } - - // Map over the edu.mit.broad.sting.atk.LocusContext +public class CountReadsWalker extends BasicReadWalker { public Integer map(LocusContext context, SAMRecord read) { return 1; } - // Given result of map function public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } - - public void onTraveralDone() { - } } diff --git a/java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java b/java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java index 4be761aec..f67e818c2 100644 --- a/java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java +++ b/java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java @@ -3,6 +3,8 @@ package edu.mit.broad.sting.atk.modules; import edu.mit.broad.sting.atk.LocusWalker; import edu.mit.broad.sting.atk.LocusIterator; import edu.mit.broad.sting.utils.ReferenceOrderedDatum; +import edu.mit.broad.sting.utils.rodDbSNP; +import edu.mit.broad.sting.utils.Utils; import edu.mit.broad.sam.SAMRecord; import java.util.List; @@ -53,14 +55,21 @@ public class PileupWalker implements LocusWalker { String rodString = ""; for ( ReferenceOrderedDatum datum : rodData ) { if ( datum != null ) { - rodString += datum.toSimpleString(); + if ( datum instanceof rodDbSNP) { + rodDbSNP dbsnp = (rodDbSNP)datum; + //System.out.printf(" DBSNP %s on %s => %s%n", dbsnp.toSimpleString(), dbsnp.strand, Utils.join("/", dbsnp.getAllelesFWD())); + rodString += dbsnp.toMediumString(); + } + else { + rodString += datum.toSimpleString(); + } } } if ( rodString != "" ) rodString = "[ROD: " + rodString + "]"; - if ( context.getPosition() % 1 == 0 ) { - System.out.printf("%s:%d: %s %s %s %s%n", context.getContig(), context.getPosition(), ref, bases, quals, rodString); + if ( context.getLocation().getStart() % 1 == 0 ) { + System.out.printf("%s: %s %s %s %s%n", context.getLocation(), ref, bases, quals, rodString); } //for ( int offset : context.getOffsets() ) { diff --git a/java/src/edu/mit/broad/sting/atk/modules/PrintReadsWalker.java b/java/src/edu/mit/broad/sting/atk/modules/PrintReadsWalker.java new file mode 100755 index 000000000..4cacbd3ab --- /dev/null +++ b/java/src/edu/mit/broad/sting/atk/modules/PrintReadsWalker.java @@ -0,0 +1,17 @@ +package edu.mit.broad.sting.atk.modules; + +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sting.atk.LocusContext; + +public class PrintReadsWalker extends BasicReadWalker { + public Integer map(LocusContext context, SAMRecord read) { + System.out.println(read.format()); + return 1; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } +} \ No newline at end of file diff --git a/java/src/edu/mit/broad/sting/utils/GenomeLoc.java b/java/src/edu/mit/broad/sting/utils/GenomeLoc.java new file mode 100644 index 000000000..6c55928de --- /dev/null +++ b/java/src/edu/mit/broad/sting/utils/GenomeLoc.java @@ -0,0 +1,195 @@ +package edu.mit.broad.sting.utils; + +import java.util.*; +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Mar 2, 2009 + * Time: 8:50:11 AM + * + * Genome location representation. It is *** 1 *** based + * + * + */ +public class GenomeLoc implements Comparable { + private String contig; + private long start; + private long stop; + + // + // Ugly global variable defining the optional ordering of contig elements + // + public static HashMap refContigOrdering = null; + public static void setContigOrdering(HashMap rco) { + refContigOrdering = rco; + } + + public GenomeLoc( final String contig, final long start, final long stop ) { + this.contig = contig; + this.start = start; + this.stop = stop; + } + + public GenomeLoc( final String contig, final long pos ) { + this( contig, pos, pos ); + } + + // + // Parsing string representations + // + private static long parsePosition( final String pos ) { + String x = pos.replaceAll(",", ""); + return Long.parseLong(x); + } + + public static GenomeLoc parseGenomeLoc( final String str ) { + // Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + System.out.printf("Parsing location '%s'%n", str); + + final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1 + final Pattern regex2 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)$"); // matches case 2 + final Pattern regex3 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$");// matches case 3 + + String contig = null; + long start = 1; + long stop = Integer.MAX_VALUE; + boolean bad = false; + + Matcher match1 = regex1.matcher(str); + Matcher match2 = regex2.matcher(str); + Matcher match3 = regex3.matcher(str); + + try { + if ( match1.matches() ) { + contig = match1.group(1); + } + else if ( match2.matches() ) { + contig = match2.group(1); + start = parsePosition(match2.group(2)); + } + else if ( match3.matches() ) { + contig = match3.group(1); + start = parsePosition(match3.group(2)); + stop = parsePosition(match3.group(3)); + + if ( start > stop ) + bad = true; + } + else { + bad = true; + } + } catch ( Exception e ) { + bad = true; + } + + if ( bad ) { + throw new RuntimeException("Invalid Genome Location string: " + str); + } + + GenomeLoc loc = new GenomeLoc(contig, start, stop); + System.out.printf(" => Parsed location '%s' into %s%n", str, loc); + + return loc; + } + + // + // Accessors and setters + // + public final String getContig() { return this.contig; } + public final long getStart() { return this.start; } + public final long getStop() { return this.stop; } + public final String toString() { + if ( throughEndOfContigP() && atBeginningOfContigP() ) + return getContig(); + else if ( throughEndOfContigP() || getStart() == getStop() ) + return String.format("%s:%d", getContig(), getStart()); + else + return String.format("%s:%d-%d", getContig(), getStart(), getStop()); + } + + + public final boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; } + public final boolean atBeginningOfContigP() { return this.start == 1; } + + public void setContig(String contig) { + this.contig = contig; + } + + public void setStart(long start) { + this.start = start; + } + public void setStop(long stop) { + this.stop = stop; + } + + public final boolean isSingleBP() { return stop == start; } + + public final boolean disjointP(GenomeLoc that) { + if ( compareContigs(this.contig, that.contig) != 0 ) return true; // different chromosomes + if ( this.start > that.stop ) return true; // this guy is past that + if ( that.start > this.stop ) return true; // that guy is past our start + return false; + } + + public final boolean overlapsP(GenomeLoc that) { + return ! disjointP( that ); + } + + public final boolean onSameContig(GenomeLoc that) { + return this.contig.equals(that.contig); + } + + + // + // Comparison operations + // + public static int compareContigs( final String thisContig, final String thatContig ) { + if ( refContigOrdering != null ) { + if ( ! refContigOrdering.containsKey(thisContig) ) { + if ( ! refContigOrdering.containsKey(thatContig) ) { + // Use regular sorted order + return thisContig.compareTo(thatContig); + } + else { + // this is always bigger if that is in the key set + return 1; + } + } + else if ( ! refContigOrdering.containsKey(thatContig) ) + return -1; + else { + assert refContigOrdering.containsKey(thisContig);// : this; + assert refContigOrdering.containsKey(thatContig);// : that; + + final int thisO = refContigOrdering.get(thisContig); + final int thatO = refContigOrdering.get(thatContig); + if ( thisO < thatO ) return -1; + if ( thisO > thatO ) return 1; + return 0; + } + } + else { + return thisContig.compareTo(thatContig); + } + } + + public int compareContigs( GenomeLoc that ) { + return compareContigs( this.contig, that.contig ); + } + + + public int compareTo( GenomeLoc that ) { + if ( this == that ) return 0; + + final int cmpContig = compareContigs( this.getContig(), that.getContig() ); + if ( cmpContig != 0 ) return cmpContig; + if ( this.getStart() < that.getStart() ) return -1; + if ( this.getStart() > that.getStart() ) return 1; + if ( this.getStop() < that.getStop() ) return -1; + if ( this.getStop() > that.getStop() ) return 1; + return 0; + } +} diff --git a/java/src/edu/mit/broad/sting/utils/ReferenceIterator.java b/java/src/edu/mit/broad/sting/utils/ReferenceIterator.java index 514d3dd20..c5b28c8f9 100755 --- a/java/src/edu/mit/broad/sting/utils/ReferenceIterator.java +++ b/java/src/edu/mit/broad/sting/utils/ReferenceIterator.java @@ -21,7 +21,7 @@ public class ReferenceIterator implements Iterator { private ReferenceSequence currentContig = null; private ReferenceSequence nextContig = null; - private int offset = -1; + private long offset = -1; public ReferenceIterator( ReferenceSequenceFile refFile ) { this.refFile = refFile; @@ -32,11 +32,11 @@ public class ReferenceIterator implements Iterator { // Accessing data // // -------------------------------------------------------------------------------------------------------------- - public byte getBaseAsByte() { return currentContig.getBases()[offset]; } - public String getBaseAsString() { return StringUtil.bytesToString(currentContig.getBases(), offset, 1); } + public byte getBaseAsByte() { return currentContig.getBases()[(int)offset]; } + public String getBaseAsString() { return StringUtil.bytesToString(currentContig.getBases(), (int)offset, 1); } public char getBaseAsChar() { return getBaseAsString().charAt(0); } public ReferenceSequence getCurrentContig() { return currentContig; } - public int getPosition() { return offset + 1; } + public long getPosition() { return offset + 1; } // -------------------------------------------------------------------------------------------------------------- // @@ -91,11 +91,15 @@ public class ReferenceIterator implements Iterator { // Jumping forward // // -------------------------------------------------------------------------------------------------------------- - public ReferenceIterator seekForward(final String contigName, final int pos) { + public ReferenceIterator seekForward(final GenomeLoc loc) { + return seekForwardOffset(loc.getContig(), loc.getStart() - 1); + } + + public ReferenceIterator seekForward(final String contigName, final long pos) { return seekForwardOffset(contigName, pos - 1); } - private ReferenceIterator seekForwardOffset(final String contigName, final int seekOffset) { + private ReferenceIterator seekForwardOffset(final String contigName, final long seekOffset) { // jumps us forward in the sequence to the contig / pos if ( currentContig == null ) next(); diff --git a/java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java index bd65b6c7d..e5a62d5c4 100644 --- a/java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java +++ b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java @@ -39,7 +39,7 @@ public class ReferenceOrderedData implements public void testMe() { ReferenceOrderedDatum last = null; for ( ReferenceOrderedDatum rec : this ) { - if ( last == null || ! last.getContig().equals(rec.getContig()) ) { + if ( last == null || ! last.getLocation().onSameContig(rec.getLocation()) ) { System.out.println(rec.toString()); } last = rec; @@ -137,7 +137,11 @@ public class ReferenceOrderedData implements // If we don't find anything and cross beyond contig / pos, we return null // Otherwise we return the first object who's start is at pos // - public ROD seekForward(final String contigName, final int pos) { + public ROD seekForward(final GenomeLoc loc) { + return seekForward(loc.getContig(), loc.getStart()); + } + + protected ROD seekForward(final String contigName, final long pos) { final boolean DEBUG = false; ROD result = null; @@ -146,7 +150,7 @@ public class ReferenceOrderedData implements while ( hasNext() ) { ROD current = next(); //System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart()); - int strCmp = ReferenceOrderedDatum.compareContigs( contigName, prev.getContig() );// contigName.compareTo( prev.getContig() ); + int strCmp = GenomeLoc.compareContigs( contigName, prev.getContig() );// contigName.compareTo( prev.getContig() ); if ( strCmp == 0 ) { // The contigs are equal if ( current.getStart() > pos ) { diff --git a/java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java index 111da8fa5..709fd8b3d 100644 --- a/java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java +++ b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java @@ -1,11 +1,5 @@ package edu.mit.broad.sting.utils; -import java.util.Comparator; -import java.util.HashMap; - -// -// Ugly global variable defining the optional ordering of contig elements -// /** * Created by IntelliJ IDEA. * User: mdepristo @@ -13,11 +7,7 @@ import java.util.HashMap; * Time: 10:49:47 AM * To change this template use File | Settings | File Templates. */ -public abstract class ReferenceOrderedDatum implements Comparable { - public static HashMap refContigOrdering = null; - public static void setContigOrdering(HashMap rco) { - refContigOrdering = rco; - } +public abstract class ReferenceOrderedDatum implements Comparable { public ReferenceOrderedDatum() { } @@ -27,51 +17,12 @@ public abstract class ReferenceOrderedDatum implements Comparable { public abstract String toSimpleString(); public abstract String repl(); - public abstract String getContig(); - public abstract long getStart(); - public abstract long getStop(); - - public static int compareContigs( final String thisContig, final String thatContig ) { - if ( refContigOrdering != null ) { - if ( ! refContigOrdering.containsKey(thisContig) ) { - if ( ! refContigOrdering.containsKey(thatContig) ) { - // Use regular sorted order - return thisContig.compareTo(thatContig); - } - else { - // this is always bigger if that is in the key set - return 1; - } - } - else if ( ! refContigOrdering.containsKey(thatContig) ) - return -1; - else { - assert refContigOrdering.containsKey(thisContig);// : this; - assert refContigOrdering.containsKey(thatContig);// : that; - - final int thisO = refContigOrdering.get(thisContig); - final int thatO = refContigOrdering.get(thatContig); - if ( thisO < thatO ) return -1; - if ( thisO > thatO ) return 1; - return 0; - } - } - else { - return thisContig.compareTo(thatContig); - } + public abstract GenomeLoc getLocation(); + public int compareTo( ReferenceOrderedDatum that ) { + return getLocation().compareTo(that.getLocation()); } - public int compareTo( Object x ) { - if ( this == x ) return 0; - - ReferenceOrderedDatum that = (ReferenceOrderedDatum)x; - - final int cmpContig = compareContigs( this.getContig(), that.getContig() ); - if ( cmpContig != 0 ) return cmpContig; - if ( this.getStart() < that.getStart() ) return -1; - if ( this.getStart() > that.getStart() ) return 1; - if ( this.getStop() < that.getStop() ) return -1; - if ( this.getStop() > that.getStop() ) return 1; - return 0; - } + public final String getContig() { return getLocation().getContig(); } + public final long getStart() { return getLocation().getStart(); } + public final long getStop() { return getLocation().getStop(); } } diff --git a/java/src/edu/mit/broad/sting/utils/Utils.java b/java/src/edu/mit/broad/sting/utils/Utils.java index bc8c6154a..a9e2150a0 100755 --- a/java/src/edu/mit/broad/sting/utils/Utils.java +++ b/java/src/edu/mit/broad/sting/utils/Utils.java @@ -67,7 +67,11 @@ public class Utils { } return ret.toString(); } - + + public static String join(String separator, Collection strings) { + return join( separator, strings.toArray(new String[0]) ); + } + public static void setupRefContigOrdering(final ReferenceSequenceFile refFile) { List refContigs = refFile.getSequenceDictionary(); HashMap refContigOrdering = new HashMap(); @@ -81,6 +85,6 @@ public class Utils { } System.out.printf("%n Total elements -> %d%n", refContigOrdering.size()); - ReferenceOrderedDatum.setContigOrdering(refContigOrdering); + GenomeLoc.setContigOrdering(refContigOrdering); } } diff --git a/java/src/edu/mit/broad/sting/utils/rodDbSNP.java b/java/src/edu/mit/broad/sting/utils/rodDbSNP.java index eb29a36ca..a2c5f8de4 100644 --- a/java/src/edu/mit/broad/sting/utils/rodDbSNP.java +++ b/java/src/edu/mit/broad/sting/utils/rodDbSNP.java @@ -3,13 +3,13 @@ package edu.mit.broad.sting.utils; import edu.mit.broad.sam.SAMRecord; import edu.mit.broad.sam.util.CloseableIterator; import edu.mit.broad.picard.util.TabbedTextFileParser; +import edu.mit.broad.picard.util.SequenceUtil; import java.io.File; import java.io.InputStream; import java.io.FileInputStream; import java.io.BufferedInputStream; -import java.util.Iterator; -import java.util.HashMap; +import java.util.*; /** * Example format: @@ -22,15 +22,18 @@ import java.util.HashMap; * To change this template use File | Settings | File Templates. */ public class rodDbSNP extends ReferenceOrderedDatum { - public String contig; // Reference sequence chromosome or scaffold - public long start, stop; // Start and stop positions in chrom + public GenomeLoc loc; // genome location of SNP + // Reference sequence chromosome or scaffold + // Start and stop positions in chrom - public String name; // Reference SNP identifier or Affy SNP name - public String strand; // Which DNA strand contains the observed alleles - public String observed; // The sequences of the observed alleles from rs-fasta files - public char[] observedBases; // The sequences of the observed alleles from rs-fasta files - public String molType; // Sample type from exemplar ss - public String varType; // The class of variant (simple, insertion, deletion, range, etc.) + public String name; // Reference SNP identifier or Affy SNP name + public String strand; // Which DNA strand contains the observed alleles + + public String refBases; // the reference base according to NCBI, in the dbSNP file + public String observed; // The sequences of the observed alleles from rs-fasta files + + public String molType; // Sample type from exemplar ss + public String varType; // The class of variant (simple, insertion, deletion, range, etc.) // Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion' public String validationStatus; // The validation status of the SNP // one of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap') @@ -53,9 +56,52 @@ public class rodDbSNP extends ReferenceOrderedDatum { // ---------------------------------------------------------------------- public rodDbSNP() {} - public String getContig() { return this.contig; } - public long getStart() { return start; } - public long getStop() { return stop; } + // ---------------------------------------------------------------------- + // + // manipulating the SNP information + // + // ---------------------------------------------------------------------- + public GenomeLoc getLocation() { return loc; } + + public boolean onFwdStrand() { + return strand.equals("+"); + } + + // Get the reference bases on the forward strand + public String getRefBasesFWD() { + if ( onFwdStrand() ) + return refBases; + else + return SequenceUtil.reverseComplement(refBases); + } + + public List getAllelesFWD() { + List alleles = null; + if ( onFwdStrand() ) + alleles = Arrays.asList(observed.split("/")); + else + alleles = Arrays.asList(SequenceUtil.reverseComplement(observed).split("/")); + + //System.out.printf("getAlleles %s on %s %b => %s %n", observed, strand, onFwdStrand(), Utils.join("/", alleles)); + return alleles; + } + + public String getAllelesFWDString() { + return Utils.join("/", getAllelesFWD()); + } + + // ---------------------------------------------------------------------- + // + // What kind of variant are we? + // + // ---------------------------------------------------------------------- + public boolean isSNP() { return varType.contains("single"); } + public boolean isInsertion() { return varType.contains("insertion"); } + public boolean isDeletion() { return varType.contains("deletion"); } + public boolean isIndel() { return varType.contains("in-del"); } + + public boolean isHapmap() { return validationStatus.contains("by-hapmap"); } + public boolean is2Hit2Allele() { return validationStatus.contains("by-2hit-2allele"); } // ---------------------------------------------------------------------- // @@ -63,28 +109,39 @@ public class rodDbSNP extends ReferenceOrderedDatum { // // ---------------------------------------------------------------------- public String toString() { - return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", - contig, start, stop, name, strand, observed, molType, + return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", + getContig(), getStart(), getStop(), name, strand, refBases, observed, molType, varType, validationStatus, avHet, avHetSE, func, locType, weight ); } public String toSimpleString() { - return String.format("%s:%s", name, observed); + return String.format("%s:%s:%s", name, observed, strand); + } + + public String toMediumString() { + String s = String.format("%s:%s:%s", getLocation().toString(), name, getAllelesFWDString()); + if ( isSNP() ) s += ":SNP"; + if ( isIndel() ) s += ":Indel"; + if ( isHapmap() ) s += ":Hapmap"; + if ( is2Hit2Allele() ) s += ":2Hit"; + return s; } public String repl() { - return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\tX\tX\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", - 585, contig, start-1, stop-1, name, strand, observed, molType, + return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", + 585, getContig(), getStart()-1, getStop()-1, name, strand, refBases, refBases, observed, molType, varType, validationStatus, avHet, avHetSE, func, locType, weight ); } public void parseLine(final String[] parts) { try { - contig = parts[1]; - start = Long.parseLong(parts[2]) + 1; // The final is 0 based + String contig = parts[1]; + long start = Long.parseLong(parts[2]) + 1; // The final is 0 based + long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based + loc = new GenomeLoc(contig, start, stop); - stop = Long.parseLong(parts[3]) + 1; // The final is 0 based name = parts[4]; + refBases = parts[5]; strand = parts[6]; observed = parts[9]; molType = parts[10]; @@ -95,15 +152,6 @@ public class rodDbSNP extends ReferenceOrderedDatum { func = parts[15]; locType = parts[16]; weight = Integer.parseInt(parts[17]); - - // Cut up the observed bases string into an array of individual bases - String[] bases = observed.split("/"); - observedBases = new char[bases.length]; - for ( String elt : bases ) { - observedBases[0] = (char)elt.getBytes()[0]; - //System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]); - } - //System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases)); } catch ( RuntimeException e ) { System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts)); throw e; diff --git a/java/src/edu/mit/broad/sting/utils/rodGFF.java b/java/src/edu/mit/broad/sting/utils/rodGFF.java index ea94f7fa8..bff564ab2 100644 --- a/java/src/edu/mit/broad/sting/utils/rodGFF.java +++ b/java/src/edu/mit/broad/sting/utils/rodGFF.java @@ -24,7 +24,7 @@ public class rodGFF extends ReferenceOrderedDatum { private long start, stop; private double score; private HashMap attributes; - + // ---------------------------------------------------------------------- // // Constructors @@ -53,10 +53,6 @@ public class rodGFF extends ReferenceOrderedDatum { // Accessors // // ---------------------------------------------------------------------- - public String getContig() { - return this.contig; - } - public String getSource() { return source; } @@ -73,18 +69,14 @@ public class rodGFF extends ReferenceOrderedDatum { return frame; } - public long getStart() { - return start; - } - - public long getStop() { - return stop; - } - public double getScore() { return score; } + public GenomeLoc getLocation() { + return new GenomeLoc(contig, start, stop); + } + public String getAttribute(final String key) { return attributes.get(key); } diff --git a/scripts/TraverseTest.sh b/scripts/TraverseTest.sh index 6dde72393..d39c05437 100755 --- a/scripts/TraverseTest.sh +++ b/scripts/TraverseTest.sh @@ -1 +1 @@ -java -Xmx1024m -cp out/production/AnalysisTK:../../jars/broad.jar edu.mit.broad.sting.atk.AnalysisTK $* +java -Xmx1024m -cp out/production/AnalysisTK:trunk/java/jars/functionalj.jar edu.mit.broad.sting.atk.AnalysisTK $* diff --git a/scripts/TraverseTestProf.sh b/scripts/TraverseTestProf.sh index 91cf7fab5..c542c8337 100755 --- a/scripts/TraverseTestProf.sh +++ b/scripts/TraverseTestProf.sh @@ -1 +1 @@ -java -Xmx1024m -agentlib:hprof=cpu=samples -cp out/production/AnalysisTK:../../jars/broad.jar edu.mit.broad.sting.atk.AnalysisTK $* +java -Xmx1024m -agentlib:hprof=cpu=samples -cp out/production/AnalysisTK:trunk/java/jars/functionalj.jar edu.mit.broad.sting.atk.AnalysisTK $*