package org.broadinstitute.sting.utils; import edu.mit.broad.picard.directed.IntervalList; import edu.mit.broad.picard.reference.ReferenceSequenceFile; import edu.mit.broad.picard.util.Interval; import net.sf.functionalj.Function1; import net.sf.functionalj.FunctionN; import net.sf.functionalj.Functions; import net.sf.functionalj.reflect.JdkStdReflect; import net.sf.functionalj.reflect.StdReflect; import net.sf.functionalj.util.Operators; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import java.io.File; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Created by IntelliJ IDEA. * User: mdepristo * Date: Mar 2, 2009 * Time: 8:50:11 AM * * Genome location representation. It is *** 1 *** based * * */ public class GenomeLoc implements Comparable { private static Logger logger = Logger.getLogger(GenomeLoc.class); private Integer contigIndex; private long start; private long stop; // -------------------------------------------------------------------------------------------------------------- // // Ugly global variable defining the optional ordering of contig elements // // -------------------------------------------------------------------------------------------------------------- //public static Map refContigOrdering = null; private static SAMSequenceDictionary contigInfo = null; public static boolean hasKnownContigOrdering() { return contigInfo != null; } public static SAMSequenceRecord getContigInfo( final String contig ) { return contigInfo.getSequence(contig); } /** * Returns the contig index of a specified string version of the contig * @param contig the contig string * @return the contig index, -1 if not found */ public static int getContigIndex( final String contig ) { if (contigInfo.getSequenceIndex(contig) == -1) Utils.scareUser(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); return contigInfo.getSequenceIndex(contig); } public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) { return setupRefContigOrdering(refFile.getSequenceDictionary()); } public static boolean setupRefContigOrdering(final SAMSequenceDictionary seqDict) { if (seqDict == null) { // we couldn't load the reference dictionary logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs"); Utils.scareUser("Failed to load reference dictionary"); return false; } else if ( contigInfo == null ){ contigInfo = seqDict; logger.debug(String.format("Prepared reference sequence contig dictionary")); for (SAMSequenceRecord contig : seqDict.getSequences() ) { logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); } } return true; } // -------------------------------------------------------------------------------------------------------------- // // constructors // // -------------------------------------------------------------------------------------------------------------- public GenomeLoc( int contigIndex, final long start, final long stop ) { if(contigInfo == null) { throw new StingException("Contig info has not been setup in the GenomeLoc context yet."); } if (contigIndex < 0 || contigIndex >= contigInfo.size()) { throw new StingException("Contig info has not been setup in the GenomeLoc context yet."); } if (start < 0) { throw new StingException("Bad start position " + start);} if (stop < -1) { throw new StingException("Bad stop position " + stop); } // a negative -1 indicates it's not a meaningful end position this.contigIndex = contigIndex; this.start = start; this.stop = stop == -1 ? start : stop; } public GenomeLoc(final SAMRecord read) { this(read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd()); } public GenomeLoc( final String contig, final long start, final long stop ) { this(contigInfo.getSequenceIndex(contig), start, stop); } public GenomeLoc( final String contig, final long pos ) { this(contig, pos, pos); } public GenomeLoc( final int contig, final long pos ) { this(contig, pos, pos ); } public GenomeLoc( final GenomeLoc toCopy ) { this( toCopy.contigIndex, toCopy.getStart(), toCopy.getStop() ); } // -------------------------------------------------------------------------------------------------------------- // // Parsing string representations // // -------------------------------------------------------------------------------------------------------------- private static long parsePosition( final String pos ) { String x = pos.replaceAll(",", ""); return Long.parseLong(x); } public static GenomeLoc parseGenomeLoc( final String str ) { // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' //System.out.printf("Parsing location '%s'%n", str); final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1 final Pattern regex2 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)$"); // matches case 2 final Pattern regex3 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$"); // matches case 3 final Pattern regex4 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)\\+"); // matches case 4 String contig = null; long start = 1; long stop = Integer.MAX_VALUE; boolean bad = false; Matcher match1 = regex1.matcher(str); Matcher match2 = regex2.matcher(str); Matcher match3 = regex3.matcher(str); Matcher match4 = regex4.matcher(str); try { if ( match1.matches() ) { contig = match1.group(1); } else if ( match2.matches() ) { contig = match2.group(1); start = parsePosition(match2.group(2)); stop = start; } else if ( match4.matches() ) { contig = match4.group(1); start = parsePosition(match4.group(2)); } else if ( match3.matches() ) { contig = match3.group(1); start = parsePosition(match3.group(2)); stop = parsePosition(match3.group(3)); if ( start > stop ) bad = true; } else { bad = true; } } catch ( Exception e ) { bad = true; } if ( bad ) { throw new StingException("Invalid Genome Location string: " + str); } if ( stop == Integer.MAX_VALUE && hasKnownContigOrdering() ) { // lookup the actually stop position! stop = getContigInfo(contig).getSequenceLength(); } GenomeLoc loc = new GenomeLoc(contig, start, stop); // System.out.printf(" => Parsed location '%s' into %s%n", str, loc); return loc; } /** * Useful utility function that parses a location string into a coordinate-order sorted * array of GenomeLoc objects * * @param str String representation of genome locs. Null string corresponds to no filter. * @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order */ public static ArrayList parseGenomeLocs(final String str) { // Null string means no filter. if( str == null ) return null; // Of the form: loc1;loc2;... // Where each locN can be: // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' StdReflect reflect = new JdkStdReflect(); FunctionN parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class); Function1 f1 = parseOne.f1(); try { Collection result = Functions.map(f1, Arrays.asList(str.split(";"))); ArrayList locs = new ArrayList(result); Collections.sort(locs); //logger.info(String.format("Going to process %d locations", locs.length)); locs = mergeOverlappingLocations(locs); logger.info("Locations are:" + Utils.join(", ", Functions.map(Operators.toString, locs))); return locs; } catch (Exception e) { e.printStackTrace(); Utils.scareUser(String.format("Invalid locations string: %s, format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str)); return null; } } public static ArrayList mergeOverlappingLocations(final ArrayList raw) { logger.debug(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw))); if ( raw.size() <= 1 ) return raw; else { ArrayList merged = new ArrayList(); Iterator it = raw.iterator(); GenomeLoc prev = it.next(); while ( it.hasNext() ) { GenomeLoc curr = it.next(); if ( prev.contiguousP(curr) ) { prev = prev.merge(curr); } else { merged.add(prev); prev = curr; } } merged.add(prev); return merged; } } /** * Move this Genome loc to the next contig, with a start * and stop of 1. * @return true if we are not out of contigs, otherwise false if we're * at the end of the genome (no more contigs to jump to). */ public boolean toNextContig() { if ((contigIndex + 1) < GenomeLoc.contigInfo.size()) { this.contigIndex++; this.start = 1; this.stop = 1; return true; } return false; } /** * Returns true iff we have a specified series of locations to process AND we are past the last * location in the list. It means that, in a serial processing of the genome, that we are done. * * @param curr Current genome Location * @return true if we are past the last location to process */ public static boolean pastFinalLocation(GenomeLoc curr, ArrayList locs) { return (locs.size() > 0 && curr.isPast(locs.get(locs.size() - 1))); } /** * A key function that returns true if the proposed GenomeLoc curr is within the list of * locations we are processing in this TraversalEngine * * @param curr * @return true if we should process GenomeLoc curr, otherwise false */ public static boolean inLocations(GenomeLoc curr, ArrayList locs) { if ( locs.size() == 0 ) { return true; } else { for ( GenomeLoc loc : locs ) { //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); if (loc.overlapsP(curr)) return true; } return false; } } public static void removePastLocs(GenomeLoc curr, List locs) { while ( !locs.isEmpty() && curr.isPast(locs.get(0)) ) { //System.out.println("At: " + curr + ", removing: " + locs.get(0)); locs.remove(0); } } public static boolean overlapswithSortedLocsP(GenomeLoc curr, List locs, boolean returnTrueIfEmpty) { if ( locs.isEmpty() ) return returnTrueIfEmpty; // skip loci before intervals begin if ( hasKnownContigOrdering() && curr.contigIndex < locs.get(0).contigIndex ) return false; for ( GenomeLoc loc : locs ) { //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); if ( loc.overlapsP(curr) ) return true; if ( curr.compareTo(loc) < 0 ) return false; } return false; } // // Accessors and setters // public final String getContig() { //this.contigIndex != -1; if (!(contigInfo != null && contigInfo.getSequences() != null)) { throw new StingException("The contig information or it's sequences are null"); } if ((this.contigIndex < 0) || (this.contigIndex >= contigInfo.getSequences().size())) { throw new StingException("The contig index is not bounded by the zero and seqeunce count, contig index: " + contigIndex); } if (contigInfo.getSequence(this.contigIndex) == null || contigInfo.getSequence(this.contigIndex).getSequenceName() == null) { throw new StingException("The associated sequence index for contig " + contigIndex + " is null"); } return contigInfo.getSequence(this.contigIndex).getSequenceName(); //if (contigInfo != null && contigInfo.getSequence(this.contigIndex) != null) { // return contigInfo.getSequence(this.contigIndex).getSequenceName(); //} //return null; } public final int getContigIndex() { return this.contigIndex; } public final long getStart() { return this.start; } public final long getStop() { return this.stop; } public final String toString() { if ( throughEndOfContigP() && atBeginningOfContigP() ) return getContig(); else if ( throughEndOfContigP() || getStart() == getStop() ) return String.format("%s:%d", getContig(), getStart()); else return String.format("%s:%d-%d", getContig(), getStart(), getStop()); } public final boolean isUnmapped() { return this.contigIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; } public final boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; } public final boolean atBeginningOfContigP() { return this.start == 1; } public void setContig(String contig) { this.contigIndex = contigInfo.getSequenceIndex(contig); } public void setStart(long start) { this.start = start; } public void setStop(long stop) { this.stop = stop; } public final boolean isSingleBP() { return stop == start; } public final boolean disjointP(GenomeLoc that) { if ( this.contigIndex != that.contigIndex ) return true; // different chromosomes if ( this.start > that.stop ) return true; // this guy is past that if ( that.start > this.stop ) return true; // that guy is past our start return false; } public final boolean discontinuousP(GenomeLoc that) { if ( this.contigIndex != that.contigIndex ) return true; // different chromosomes if ( (this.start - 1) > that.stop ) return true; // this guy is past that if ( (that.start - 1) > this.stop ) return true; // that guy is past our start return false; } public final boolean overlapsP(GenomeLoc that) { return ! disjointP( that ); } public final boolean contiguousP(GenomeLoc that) { return ! discontinuousP( that ); } public GenomeLoc merge( GenomeLoc that ) throws StingException { if (!(this.contiguousP(that))) { throw new StingException("The two genome loc's need to be contigous"); } return new GenomeLoc(getContig(), Math.min(getStart(), that.getStart()), Math.max( getStop(), that.getStop()) ); } public final boolean containsP(GenomeLoc that) { if ( ! onSameContig(that) ) return false; return getStart() <= that.getStart() && getStop() >= that.getStop(); } public final boolean onSameContig(GenomeLoc that) { return (this.contigIndex == that.contigIndex); } public final int minus( final GenomeLoc that ) { if ( this.contigIndex == that.contigIndex ) return (int) (this.getStart() - that.getStart()); else return Integer.MAX_VALUE; } public final int distance( final GenomeLoc that ) { return Math.abs(minus(that)); } public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) { return this.compareTo(left) > -1 && this.compareTo(right) < 1; } public final boolean isPast( GenomeLoc that ) { int comparison = this.compareContigs(that); return ( comparison == 1 || ( comparison == 0 && this.getStart() > that.getStop() )); } public final void incPos() { incPos(1); } public final void incPos(long by) { this.start += by; this.stop += by; } public final GenomeLoc nextLoc() { GenomeLoc n = new GenomeLoc(this); n.incPos(); return n; } // Dangerous // public boolean equals(Object o) { // // Not strictly necessary, but often a good optimization // if (this == o) // return true; // if (!(o instanceof GenomeLoc)) // return false; // else // return compareContigs((GenomeLoc)o) == 0; // } // // Comparison operations // // TODO: get rid of this method because it's sloooooooooooooow @Deprecated public static int compareContigs( final String thisContig, final String thatContig ) { if ( thisContig == thatContig ) { // Optimization. If the pointers are equal, then the contigs are equal. return 0; } if ( hasKnownContigOrdering() ) { int thisIndex = getContigIndex(thisContig); int thatIndex = getContigIndex(thatContig); if ( thisIndex == -1 ) { if ( thatIndex == -1 ) { // Use regular sorted order return thisContig.compareTo(thatContig); } else { // this is always bigger if that is in the key set return 1; } } else if ( thatIndex == -1 ) { return -1; } else { if ( thisIndex < thatIndex ) return -1; if ( thisIndex > thatIndex ) return 1; return 0; } } else { return thisContig.compareTo(thatContig); } } public final int compareContigs( GenomeLoc that ) { return (this.contigIndex == that.contigIndex)?0:((this.contigIndex < that.contigIndex)?-1:1); } public int compareTo( GenomeLoc that ) { if ( this == that ) return 0; final int cmpContig = compareContigs(that); if ( cmpContig != 0 ) return cmpContig; if ( this.getStart() < that.getStart() ) return -1; if ( this.getStart() > that.getStart() ) return 1; // TODO: and error is being thrown because we are treating reads with the same start positions // but different stop as out of order //if ( this.getStop() < that.getStop() ) return -1; //if ( this.getStop() > that.getStop() ) return 1; return 0; } /** * Read a file of genome locations to process. * regions specified by the location string. The string is of the form: * Of the form: loc1;loc2;... * Where each locN can be: * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' * * @param file_name */ public static ArrayList IntervalFileToList(final String file_name) { // first try to read it as an interval file since that's well structured // we'll fail quickly if it's not a valid file. Then try to parse it as // a location string file ArrayList ret = null; try { IntervalList il = IntervalList.fromFile(new File(file_name)); // iterate through the list of merged intervals and add then as GenomeLocs ret = new ArrayList(); for(Interval interval : il.getUniqueIntervals()) { ret.add(new GenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd())); } return ret; } catch (Exception e) { try { xReadLines reader = new xReadLines(new File(file_name)); List lines = reader.readLines(); reader.close(); String locStr = Utils.join(";", lines); logger.debug("locStr: " + locStr); ret = parseGenomeLocs(locStr); return ret; } catch (Exception e2) { e2.printStackTrace(); throw new StingException("Unable to parse out interval file in either format", e); } } } }