diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 194270c7b..8b026b4ae 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.traversals.TraverseByReads; import org.broadinstitute.sting.gatk.traversals.TraverseByLoci; +import org.broadinstitute.sting.gatk.traversals.TraverseByLociByReference; import org.broadinstitute.sting.utils.FastaSequenceFile2; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; @@ -187,7 +188,10 @@ public class GenomeAnalysisTK extends CommandLineProgram { // Try to get the walker specified try { LocusWalker walker = (LocusWalker) my_walker; - this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods); + if ( WALK_ALL_LOCI ) + this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods); + else + this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods); } catch (java.lang.ClassCastException e) { // I guess we're a read walker LOL @@ -198,16 +202,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { // Prepare the sort ordering w.r.t. the sequence dictionary if (REF_FILE_ARG != null) { final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG); - List refContigs = refFile.getSequenceDictionary().getSequences(); - - HashMap refContigOrdering = new HashMap(); - int i = 0; - for (SAMSequenceRecord contig : refContigs) { - refContigOrdering.put(contig.getSequenceName(), i); - i++; - } - - GenomeLoc.setContigOrdering(refContigOrdering); + GenomeLoc.setupRefContigOrdering(refFile); } ValidationStringency strictness; if (STRICTNESS_ARG == null) { @@ -232,6 +227,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { if (INTERVALS_FILE != null) { engine.setLocationFromFile(INTERVALS_FILE); } + engine.setSafetyChecking(!UNSAFE); engine.setSortOnFly(ENABLED_SORT_ON_FLY); engine.setThreadedIO(ENABLED_THREADED_IO); @@ -277,7 +273,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { */ private static void testNewReferenceFeatures(final File refFileName) { final FastaSequenceFile2 refFile = new FastaSequenceFile2(refFileName); - Utils.setupRefContigOrdering(refFile); + GenomeLoc.setupRefContigOrdering(refFile); List refContigs = refFile.getSequenceDictionary().getSequences(); diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/SamQueryIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/SamQueryIterator.java index 75432f718..1fb631d8c 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/SamQueryIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/SamQueryIterator.java @@ -24,14 +24,14 @@ public class SamQueryIterator implements Iterator { Iterator locIter = null; CloseableIterator recordIter = null; - public SamQueryIterator( SAMFileReader reader, GenomeLoc[] locs ) { + public SamQueryIterator( SAMFileReader reader, ArrayList locs ) { this.reader = reader; // Our internal contract for the class guarantees that locIter and recordIter are never null. // Initialize them and seed them with empty data as necessary. if(locs != null) { // The user requested a specific set of locations, set up the iterators accordly. - locIter = Arrays.asList(locs).iterator(); + locIter = locs.iterator(); recordIter = new NullCloseableIterator(); } else { diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index eb14050cf..44cb226ef 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -93,7 +93,7 @@ public abstract class TraversalEngine { // Locations we are going to process during the traversal - protected GenomeLoc[] locs = null; + private ArrayList locs = null; // -------------------------------------------------------------------------------------------------------------- // @@ -175,7 +175,7 @@ public abstract class TraversalEngine { * @param locStr */ public void setLocation(final String locStr) { - this.locs = parseGenomeLocs(locStr); + this.locs = GenomeLoc.parseGenomeLocs(locStr); } /** @@ -194,77 +194,19 @@ public abstract class TraversalEngine { reader.close(); String locStr = Utils.join(";", lines); logger.debug("locStr: " + locStr); - this.locs = parseGenomeLocs(locStr); + setLocation(locStr); } catch (Exception e) { e.printStackTrace(); System.exit(-1); } } - /** - * Useful utility function that parses a location string into a coordinate-order sorted - * array of GenomeLoc objects - * - * @param str - * @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order - */ - public static GenomeLoc[] parseGenomeLocs(final String str) { - // Of the form: loc1;loc2;... - // Where each locN can be: - // Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ - StdReflect reflect = new JdkStdReflect(); - FunctionN parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class); - Function1 f1 = parseOne.f1(); - try { - Collection result = Functions.map(f1, Arrays.asList(str.split(";"))); - GenomeLoc[] locs = (GenomeLoc[]) result.toArray(new GenomeLoc[0]); - Arrays.sort(locs); - logger.info(String.format("Going to process %d locations", locs.length)); - //System.out.println(" Locations are: " + Utils.join("\n", Functions.map(Operators.toString, Arrays.asList(locs)))); - return locs; - } catch (Exception e) { - logger.fatal(String.format("Invalid locations string: %s, format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str)); - throw new IllegalArgumentException("Invalid locations string: " + str + ", format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'"); - } - } - /** - * A key function that returns true if the proposed GenomeLoc curr is within the list of - * locations we are processing in this TraversalEngine - * - * @param curr - * @return true if we should process GenomeLoc curr, otherwise false - */ - public boolean inLocations(GenomeLoc curr) { - if (this.locs == null) { - return true; - } else { - for ( GenomeLoc loc : this.locs ) { - //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); - if (loc.overlapsP(curr)) - return true; - } - return false; - } - } public boolean hasLocations() { return this.locs != null; } - /** - * Returns true iff we have a specified series of locations to process AND we are past the last - * location in the list. It means that, in a serial processing of the genome, that we are done. - * - * @param curr Current genome Location - * @return true if we are past the last location to process - */ - protected boolean pastFinalLocation(GenomeLoc curr) { - boolean r = locs != null && locs[locs.length - 1].compareTo(curr) == -1 && !locs[locs.length - 1].overlapsP(curr); - //System.out.printf(" pastFinalLocation %s vs. %s => %d => %b%n", locs[locs.length-1], curr, locs[locs.length-1].compareTo( curr ), r); - return r; - } - // -------------------------------------------------------------------------------------------------------------- // // printing @@ -433,11 +375,11 @@ public abstract class TraversalEngine { //this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName); this.refFile = new FastaSequenceFile2(refFileName); // todo: replace when FastaSequenceFile2 is in picard this.refIter = new ReferenceIterator(this.refFile); - if (!Utils.setupRefContigOrdering(this.refFile)) { - // We couldn't process the reference contig ordering, fail since we need it - logger.fatal(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName)); - throw new RuntimeException("We couldn't load the contig dictionary associated with " + refFileName + ". At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down."); - } +// if (!GenomeLoc.setupRefContigOrdering(this.refFile)) { +// // We couldn't process the reference contig ordering, fail since we need it +// logger.fatal(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName)); +// throw new RuntimeException("We couldn't load the contig dictionary associated with " + refFileName + ". At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down."); +// } } } @@ -510,14 +452,14 @@ public abstract class TraversalEngine { // -------------------------------------------------------------------------------------------------------------- public T traverse(Walker walker) { - List l = new ArrayList(); + ArrayList l = new ArrayList(); if ( hasLocations() ) - l = Arrays.asList(locs); + l = locs; return traverse(walker, l); } - public T traverse(Walker walker, List locations) { + public T traverse(Walker walker, ArrayList locations) { return null; } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java index 2f52c2eef..99e5c5d45 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java @@ -15,6 +15,7 @@ import org.broadinstitute.sting.utils.Utils; import java.util.List; import java.util.Arrays; import java.util.Iterator; +import java.util.ArrayList; import java.io.File; import net.sf.samtools.SAMRecord; @@ -34,7 +35,7 @@ public class TraverseByLoci extends TraversalEngine { super(reads, ref, rods); } - public T traverse(Walker walker, List locations) { + public T traverse(Walker walker, ArrayList locations) { if ( walker instanceof LocusWalker ) { Walker x = walker; LocusWalker locusWalker = (LocusWalker)x; @@ -54,7 +55,9 @@ public class TraverseByLoci extends TraversalEngine { * @param ReduceType -- the result of calling reduce() on the walker * @return 0 on success */ - protected T traverseByLoci(LocusWalker walker, List locations) { + protected T traverseByLoci(LocusWalker walker, ArrayList locations) { + logger.debug("Entering traverseByLoci"); + samReader = initializeSAMFile(readsFile); verifySortOrder(true); @@ -63,9 +66,14 @@ public class TraverseByLoci extends TraversalEngine { walker.initialize(); T sum = walker.reduceInit(); - if ( samReader.hasIndex() && hasLocations() ) { + if ( ! locations.isEmpty() ) { + logger.debug("Doing interval-based traversal"); + + if ( ! samReader.hasIndex() ) + Utils.scareUser("Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting."); + // we are doing interval-based traversals - for ( GenomeLoc interval : locs ) { + for ( GenomeLoc interval : locations ) { logger.debug(String.format("Processing locus %s", interval.toString())); CloseableIterator readIter = samReader.queryOverlapping( interval.getContig(), @@ -79,6 +87,7 @@ public class TraverseByLoci extends TraversalEngine { } else { // We aren't locus oriented + logger.debug("Doing non-interval-based traversal"); samReadIter = WrapReadsIterator(getReadsIterator(samReader), true); sum = carryWalkerOverInterval(walker, samReadIter, sum, null); } @@ -89,6 +98,8 @@ public class TraverseByLoci extends TraversalEngine { } protected T carryWalkerOverInterval( LocusWalker walker, Iterator readIter, T sum, GenomeLoc interval ) { + logger.debug(String.format("TraverseByLoci.carryWalkerOverInterval Genomic interval is %s", interval)); + // prepare the read filtering read iterator and provide it to a new locus iterator FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc()); @@ -102,38 +113,49 @@ public class TraverseByLoci extends TraversalEngine { // if we don't have a particular interval we're processing, check them all, otherwise only operate at this // location - if ( ( interval == null && inLocations(locus.getLocation()) ) || (interval != null && interval.overlapsP(locus.getLocation())) ) { - - //System.out.format("Working at %s\n", locus.getLocation().toString()); - + if ( interval == null || interval.overlapsP(locus.getLocation()) ) { ReferenceIterator refSite = refIter.seekForward(locus.getLocation()); - final char refBase = refSite.getBaseAsChar(); locus.setReferenceContig(refSite.getCurrentContig()); // Iterate forward to get all reference ordered data covering this locus final List rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getLocation()); - logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase)); + walkAtLocus( walker, sum, locus, refSite, rodData ); - // - // Execute our contract with the walker. Call filter, map, and reduce - // - final boolean keepMeP = walker.filter(rodData, refBase, locus); - if (keepMeP) { - M x = walker.map(rodData, refBase, locus); - sum = walker.reduce(x, sum); - } + //System.out.format("Working at %s\n", locus.getLocation().toString()); if (this.maxReads > 0 && this.nRecords > this.maxReads) { logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords)); done = true; } - printProgress("loci", locus.getLocation()); - if (pastFinalLocation(locus.getLocation())) - done = true; } + + done = interval != null && locus.getLocation().isPast(interval); + //System.out.printf("done is %b, %s vs. %s%n", done, interval, locus.getLocation()); } return sum; } + + protected T walkAtLocus( final LocusWalker walker, + T sum, + final LocusContext locus, + final ReferenceIterator refSite, + final List rodData ) { + final char refBase = refSite.getBaseAsChar(); + + //logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase)); + + // + // Execute our contract with the walker. Call filter, map, and reduce + // + final boolean keepMeP = walker.filter(rodData, refBase, locus); + if (keepMeP) { + M x = walker.map(rodData, refBase, locus); + sum = walker.reduce(x, sum); + } + + printProgress("loci", locus.getLocation()); + return sum; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java new file mode 100644 index 000000000..f592165f0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java @@ -0,0 +1,108 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.gatk.iterators.LocusIterator; +import org.broadinstitute.sting.gatk.iterators.LocusIteratorByHanger; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; + +import java.util.List; +import java.util.Arrays; +import java.util.Iterator; +import java.util.ArrayList; +import java.io.File; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import edu.mit.broad.picard.filter.FilteringIterator; + +/** + * A simple, short-term solution to iterating over all reference positions over a series of + * genomic locations. Simply overloads the superclass traverse function to go over the entire + * interval's reference positions. + */ +public class TraverseByLociByReference extends TraverseByLoci { + + public TraverseByLociByReference(File reads, File ref, List rods) { + super(reads, ref, rods); + } + + public T traverse(Walker walker, ArrayList locations) { + if ( locations.isEmpty() ) + Utils.scareUser("Requested all locations be processed without providing locations to be processed!"); + + return super.traverse(walker, locations); + } + + protected T carryWalkerOverInterval( LocusWalker walker, + Iterator readIter, + T sum, + GenomeLoc interval ) { + logger.debug(String.format("TraverseByLociByReference.carryWalkerOverInterval Genomic interval is %s", interval)); + + boolean done = false; + + List NO_READS = new ArrayList(); + List NO_OFFSETS = new ArrayList(); + + FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc()); + LocusIterator locusIter = new LocusIteratorByHanger(filterIter); // prepare the iterator by loci from reads + ReferenceIterator refSite = refIter.seekForward(interval); // jump to the first reference site + LocusContext locusFromReads = advanceReadsToLoc(locusIter, interval); // load up the next locus by reads + + // We keep processing while the next reference location is within the interval + while ( interval.containsP(refSite.getLocation()) && ! done ) { + logger.debug(String.format(" LocusFromReads is %s", locusFromReads == null ? null : locusFromReads.getLocation())); + + this.nRecords++; + GenomeLoc current = refSite.getLocation(); + + // Iterate forward to get all reference ordered data covering this locus + final List rodData = getReferenceOrderedDataAtLocus(rodIters, current); + + LocusContext locus = null; + + if ( locusFromReads != null && locusFromReads.getLocation().compareTo(current) == 0 ) { // we are at the same site + locus = locusFromReads; + if ( locusIter.hasNext() ) + locusFromReads = locusIter.next(); // advance the iterator + } + else + locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads + + locus.setReferenceContig(refSite.getCurrentContig()); + sum = walkAtLocus( walker, sum, locus, refSite, rodData ); + + if (this.maxReads > 0 && this.nRecords > this.maxReads) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords)); + done = true; + } + + refSite = refIter.next(); // update our location + } + + return sum; + } + + + private LocusContext advanceReadsToLoc(LocusIterator locusIter, GenomeLoc target) { + if ( ! locusIter.hasNext() ) + return null; + + LocusContext locus = locusIter.next(); + + while ( ! target.containsP(locus.getLocation()) && locusIter.hasNext() ) { + logger.debug(String.format(" current locus is %s vs %s => %d", locus.getLocation(), target, locus.getLocation().compareTo(target))); + locus = locusIter.next(); + } + + logger.debug(String.format(" returning %s", locus.getLocation())); + return locus; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java index b9297670c..0d70379d7 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.Utils; import java.util.List; import java.util.Arrays; +import java.util.ArrayList; import java.io.File; import net.sf.samtools.SAMRecord; @@ -28,7 +29,7 @@ public class TraverseByReads extends TraversalEngine { super(reads, ref, rods); } - public T traverse(Walker walker, List locations) { + public T traverse(Walker walker, ArrayList locations) { if ( walker instanceof ReadWalker ) { Walker x = walker; ReadWalker readWalker = (ReadWalker)x; @@ -46,10 +47,10 @@ public class TraverseByReads extends TraversalEngine { * * @param walker A read walker object * @param MapType -- the result of calling map() on walker - * @param ReduceType -- the result of calling reduce() on the walker + * @param <> ReduceType -- the result of calling reduce() on the walker * @return 0 on success */ - public Object traverseByRead(ReadWalker walker, List locations) { + public Object traverseByRead(ReadWalker walker, ArrayList locations) { samReadIter = initializeReads(); if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) { @@ -83,7 +84,7 @@ public class TraverseByReads extends TraversalEngine { locus.setReferenceContig(refSite.getCurrentContig()); } - if (inLocations(loc)) { + if (GenomeLoc.inLocations(loc, locations)) { // // execute the walker contact @@ -101,7 +102,7 @@ public class TraverseByReads extends TraversalEngine { } printProgress("reads", loc); - if (pastFinalLocation(loc)) + if (GenomeLoc.pastFinalLocation(loc, locations)) done = true; //System.out.printf("Done? %b%n", done); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java index 4e3aac348..a6063e0d1 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.utils.cmdLine.Argument; import net.sf.samtools.SAMRecord; import java.util.List; @@ -15,6 +16,8 @@ import java.util.List; * To change this template use File | Settings | File Templates. */ public class PileupWalker extends LocusWalker { + public boolean FLAG_UNCOVERED_BASES = true; // todo: how do I make this a command line argument? + public void initialize() { } @@ -37,6 +40,10 @@ public class PileupWalker extends LocusWalker { quals += read.getBaseQualityString().charAt(offset); } + if ( bases.equals("") && FLAG_UNCOVERED_BASES ) { + bases = "*** UNCOVERED SITE ***"; + } + String rodString = ""; for ( ReferenceOrderedDatum datum : rodData ) { if ( datum != null ) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java b/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java index 338aed7b1..77774acc1 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java @@ -47,16 +47,7 @@ public class PrepareROD extends CommandLineProgram { // Prepare the sort ordering w.r.t. the sequence dictionary final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG); - List refContigs = refFile.getSequenceDictionary().getSequences(); - HashMap refContigOrdering = new HashMap(); - - int i = 0; - for ( SAMSequenceRecord contig : refContigs ) { - System.out.println(contig.getSequenceName()); - refContigOrdering.put(contig.getSequenceName(), i); - i++; - } - GenomeLoc.setContigOrdering(refContigOrdering); + GenomeLoc.setupRefContigOrdering(refFile); Class rodClass = Types.get(ROD_TYPE.toLowerCase()); diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java index 7b7184ce3..3699385e5 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java +++ b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java @@ -259,19 +259,7 @@ public class IndelInspector extends CommandLineProgram { setDefaultContigOrdering(); return; } - List seqs = h.getSequenceDictionary().getSequences(); - if ( seqs == null ) { - System.out.println("No reference sequence records found in SAM file header, " + - "falling back to default contig ordering"); - setDefaultContigOrdering(); - return; - } - int i = 0; - Map rco = new HashMap(); - for ( SAMSequenceRecord sr : seqs) { - rco.put(sr.getSequenceName(),i++); - } - GenomeLoc.setContigOrdering(rco); + GenomeLoc.setupRefContigOrdering(h.getSequenceDictionary()); } private void setDefaultContigOrdering() { diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 37ea726f5..521ca0cf2 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -1,9 +1,21 @@ package org.broadinstitute.sting.utils; +import net.sf.functionalj.reflect.StdReflect; +import net.sf.functionalj.reflect.JdkStdReflect; +import net.sf.functionalj.FunctionN; +import net.sf.functionalj.Function1; +import net.sf.functionalj.Functions; +import net.sf.functionalj.util.Operators; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; + import java.util.*; import java.util.regex.Pattern; import java.util.regex.Matcher; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import org.apache.log4j.Logger; + /** * Created by IntelliJ IDEA. * User: mdepristo @@ -15,23 +27,90 @@ import java.util.regex.Matcher; * */ public class GenomeLoc implements Comparable { + private static Logger logger = Logger.getLogger(GenomeLoc.class); + private String contig; private long start; private long stop; + // -------------------------------------------------------------------------------------------------------------- // // Ugly global variable defining the optional ordering of contig elements // - public static Map refContigOrdering = null; - public static HashMap interns = null; + // -------------------------------------------------------------------------------------------------------------- + //public static Map refContigOrdering = null; + private static SAMSequenceDictionary contigInfo = null; + private static HashMap interns = null; + public static boolean hasKnownContigOrdering() { + return contigInfo != null; + } + + public static SAMSequenceRecord getContigInfo( final String contig ) { + return contigInfo.getSequence(contig); + } + + public static int getContigIndex( final String contig ) { + return contigInfo.getSequenceIndex(contig); + } + + /* public static void setContigOrdering(Map rco) { refContigOrdering = rco; interns = new HashMap(); for ( String contig : rco.keySet() ) interns.put( contig, contig ); } + */ + /* + public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) { + final SAMSequenceDictionary seqDict = refFile.getSequenceDictionary(); + + if (seqDict == null) // we couldn't load the reference dictionary + return false; + + List refContigs = seqDict.getSequences(); + HashMap refContigOrdering = new HashMap(); + + if (refContigs != null) { + int i = 0; + logger.info(String.format("Prepared reference sequence contig dictionary%n order ->")); + for (SAMSequenceRecord contig : refContigs) { + logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); + refContigOrdering.put(contig.getSequenceName(), i); + i++; + } + } + + setContigOrdering(refContigOrdering); + return refContigs != null; + } + */ + + public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) { + return setupRefContigOrdering(refFile.getSequenceDictionary()); + } + + public static boolean setupRefContigOrdering(final SAMSequenceDictionary seqDict) { + if (seqDict == null) // we couldn't load the reference dictionary + return false; + else { + contigInfo = seqDict; + logger.info(String.format("Prepared reference sequence contig dictionary%n order ->")); + for (SAMSequenceRecord contig : seqDict.getSequences() ) { + logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); + } + } + + return true; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // constructors + // + // -------------------------------------------------------------------------------------------------------------- public GenomeLoc( String contig, final long start, final long stop ) { if ( interns != null ) contig = interns.get(contig); @@ -49,9 +128,11 @@ public class GenomeLoc implements Comparable { this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() ); } + // -------------------------------------------------------------------------------------------------------------- // // Parsing string representations // + // -------------------------------------------------------------------------------------------------------------- private static long parsePosition( final String pos ) { String x = pos.replaceAll(",", ""); return Long.parseLong(x); @@ -101,12 +182,104 @@ public class GenomeLoc implements Comparable { throw new RuntimeException("Invalid Genome Location string: " + str); } + if ( stop == Integer.MAX_VALUE && hasKnownContigOrdering() ) { + // lookup the actually stop position! + stop = getContigInfo(contig).getSequenceLength(); + } + GenomeLoc loc = new GenomeLoc(contig, start, stop); //System.out.printf(" => Parsed location '%s' into %s%n", str, loc); return loc; } + /** + * Useful utility function that parses a location string into a coordinate-order sorted + * array of GenomeLoc objects + * + * @param str + * @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order + */ + public static ArrayList parseGenomeLocs(final String str) { + // Of the form: loc1;loc2;... + // Where each locN can be: + // Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + StdReflect reflect = new JdkStdReflect(); + FunctionN parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class); + Function1 f1 = parseOne.f1(); + try { + Collection result = Functions.map(f1, Arrays.asList(str.split(";"))); + ArrayList locs = new ArrayList(result); + Collections.sort(locs); + //logger.info(String.format("Going to process %d locations", locs.length)); + locs = mergeOverlappingLocations(locs); + logger.info(" Locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, locs))); + return locs; + } catch (Exception e) { + e.printStackTrace(); + Utils.scareUser(String.format("Invalid locations string: %s, format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str)); + return null; + } + } + + public static ArrayList mergeOverlappingLocations(final ArrayList raw) { + logger.info(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw))); + if ( raw.size() <= 1 ) + return raw; + else { + ArrayList merged = new ArrayList(); + Iterator it = raw.iterator(); + GenomeLoc prev = it.next(); + while ( it.hasNext() ) { + GenomeLoc curr = it.next(); + if ( prev.contiguousP(curr) ) { + prev = prev.merge(curr); + } else { + merged.add(prev); + prev = curr; + } + } + merged.add(prev); + return merged; + } + } + + /** + * Returns true iff we have a specified series of locations to process AND we are past the last + * location in the list. It means that, in a serial processing of the genome, that we are done. + * + * @param curr Current genome Location + * @return true if we are past the last location to process + */ + public static boolean pastFinalLocation(GenomeLoc curr, ArrayList locs) { + if ( locs == null ) + return false; + else { + GenomeLoc last = locs.get(locs.size() - 1); + return last.compareTo(curr) == -1 && ! last.overlapsP(curr); + } + } + + /** + * A key function that returns true if the proposed GenomeLoc curr is within the list of + * locations we are processing in this TraversalEngine + * + * @param curr + * @return true if we should process GenomeLoc curr, otherwise false + */ + public static boolean inLocations(GenomeLoc curr, ArrayList locs) { + if (locs == null) { + return true; + } else { + for ( GenomeLoc loc : locs ) { + //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); + if (loc.overlapsP(curr)) + return true; + } + return false; + } + } + // // Accessors and setters // @@ -147,10 +320,34 @@ public class GenomeLoc implements Comparable { return false; } + public final boolean discontinuousP(GenomeLoc that) { + if ( compareContigs(this.contig, that.contig) != 0 ) return true; // different chromosomes + if ( (this.start - 1) > that.stop ) return true; // this guy is past that + if ( (that.start - 1) > this.stop ) return true; // that guy is past our start + return false; + } + public final boolean overlapsP(GenomeLoc that) { return ! disjointP( that ); } + public final boolean contiguousP(GenomeLoc that) { + return ! discontinuousP( that ); + } + + public GenomeLoc merge( GenomeLoc that ) { + assert this.contiguousP(that); + + return new GenomeLoc(getContig(), + Math.min(getStart(), that.getStart()), + Math.max( getStop(), that.getStop()) ); + } + + public final boolean containsP(GenomeLoc that) { + if ( ! onSameContig(that) ) return false; + return getStart() <= that.getStart() && getStop() >= that.getStop(); + } + public final boolean onSameContig(GenomeLoc that) { return this.contig.equals(that.contig); } @@ -170,6 +367,10 @@ public class GenomeLoc implements Comparable { return this.compareTo(left) > -1 && this.compareTo(right) < 1; } + public final boolean isPast( GenomeLoc that ) { + return this.compareContigs(that) == 1 || this.getStart() > that.getStop(); + } + public final void incPos() { incPos(1); } @@ -194,14 +395,17 @@ public class GenomeLoc implements Comparable { return 0; } - assert refContigOrdering.containsKey(thisContig);// : this; - assert refContigOrdering.containsKey(thatContig);// : that; + //assert getContigIndex(thisContig) != -1;// : this; + //assert getContigIndex(thatContig) != -1;// : that; - if ( refContigOrdering != null ) + if ( hasKnownContigOrdering() ) { - if ( ! refContigOrdering.containsKey(thisContig) ) + int thisIndex = getContigIndex(thisContig); + int thatIndex = getContigIndex(thatContig); + + if ( thisIndex == -1 ) { - if ( ! refContigOrdering.containsKey(thatContig) ) + if ( thatIndex == -1 ) { // Use regular sorted order return thisContig.compareTo(thatContig); @@ -212,20 +416,14 @@ public class GenomeLoc implements Comparable { return 1; } } - else if ( ! refContigOrdering.containsKey(thatContig) ) + else if ( thatIndex == -1 ) { return -1; } else { - assert refContigOrdering.containsKey(thisContig);// : this; - assert refContigOrdering.containsKey(thatContig);// : that; - - final int thisO = refContigOrdering.get(thisContig); - final int thatO = refContigOrdering.get(thatContig); - - if ( thisO < thatO ) return -1; - if ( thisO > thatO ) return 1; + if ( thisIndex < thatIndex ) return -1; + if ( thisIndex > thatIndex ) return 1; return 0; } } diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 40ab6a3b3..65eaa45c1 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -27,19 +27,19 @@ public class Utils { private static Logger logger = Logger.getLogger(FileProgressTracker.class); public static void warnUser(final String msg) { - logger.warn(String.format("********************************************************************************%n")); - logger.warn(String.format("* WARNING:%n")); - logger.warn(String.format("*%n")); - logger.warn(String.format("* %s%n", msg)); - logger.warn(String.format("********************************************************************************%n")); + logger.warn(String.format("********************************************************************************")); + logger.warn(String.format("* WARNING:")); + logger.warn(String.format("*")); + logger.warn(String.format("* %s", msg)); + logger.warn(String.format("********************************************************************************")); } public static void scareUser(final String msg) { - logger.fatal(String.format("********************************************************************************%n")); - logger.fatal(String.format("* ERROR:%n")); - logger.fatal(String.format("*%n")); - logger.fatal(String.format("* %s%n", msg)); - logger.fatal(String.format("********************************************************************************%n")); + logger.fatal(String.format("********************************************************************************")); + logger.fatal(String.format("* ERROR:")); + logger.fatal(String.format("*")); + logger.fatal(String.format("* %s", msg)); + logger.fatal(String.format("********************************************************************************")); throw new RuntimeException(msg); } @@ -232,30 +232,6 @@ public class Utils { return averageDouble(vals, vals.size()); } - public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) { - final SAMSequenceDictionary seqDict = refFile.getSequenceDictionary(); - - if (seqDict == null) // we couldn't load the reference dictionary - return false; - - List refContigs = seqDict.getSequences(); - HashMap refContigOrdering = new HashMap(); - - if (refContigs != null) { - int i = 0; - logger.info(String.format("Prepared reference sequence contig dictionary%n order ->")); - for (SAMSequenceRecord contig : refContigs) { - logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); - refContigOrdering.put(contig.getSequenceName(), i); - i++; - } - logger.info(String.format("%n Total elements -> %d%n", refContigOrdering.size())); - } - - GenomeLoc.setContigOrdering(refContigOrdering); - return refContigs != null; - } - // Java Generics can't do primitive types, so I had to do this the simplistic way public static Integer[] SortPermutation(final int[] A) {