2009-03-12 04:58:01 +08:00
package org.broadinstitute.sting.utils ;
2009-03-03 02:18:48 +08:00
2009-04-16 02:29:38 +08:00
import edu.mit.broad.picard.util.Interval ;
2009-05-29 04:13:01 +08:00
import edu.mit.broad.picard.directed.IntervalList ;
import net.sf.picard.reference.ReferenceSequenceFile ;
2009-04-16 02:29:38 +08:00
import net.sf.samtools.SAMRecord ;
2009-03-29 04:37:27 +08:00
import net.sf.samtools.SAMSequenceDictionary ;
import net.sf.samtools.SAMSequenceRecord ;
2009-04-16 02:29:38 +08:00
import org.apache.log4j.Logger ;
2009-03-29 04:37:27 +08:00
2009-04-16 02:29:38 +08:00
import java.io.File ;
2009-03-03 02:18:48 +08:00
import java.util.* ;
import java.util.regex.Matcher ;
2009-04-16 02:29:38 +08:00
import java.util.regex.Pattern ;
2009-03-29 04:37:27 +08:00
2009-03-03 02:18:48 +08:00
/**
 * Created by IntelliJ IDEA.
 * User: mdepristo
 * Date: Mar 2, 2009
 * Time: 8:50:11 AM
 *
 * Genome location representation. Coordinates are ***1*** based.
 */
2009-05-14 02:51:16 +08:00
public class GenomeLoc implements Comparable < GenomeLoc > , Cloneable {
2009-03-29 04:37:27 +08:00
private static Logger logger = Logger . getLogger ( GenomeLoc . class ) ;
2009-06-18 04:19:47 +08:00
private int contigIndex ;
2009-03-03 02:18:48 +08:00
private long start ;
private long stop ;
2009-03-29 04:37:27 +08:00
// --------------------------------------------------------------------------------------------------------------
2009-03-03 02:18:48 +08:00
//
// Ugly global variable defining the optional ordering of contig elements
//
2009-03-29 04:37:27 +08:00
// --------------------------------------------------------------------------------------------------------------
//public static Map<String, Integer> refContigOrdering = null;
private static SAMSequenceDictionary contigInfo = null ;
public static boolean hasKnownContigOrdering ( ) {
return contigInfo ! = null ;
}
2009-04-30 21:54:51 +08:00
2009-03-29 04:37:27 +08:00
public static SAMSequenceRecord getContigInfo ( final String contig ) {
return contigInfo . getSequence ( contig ) ;
}
2009-04-17 05:54:56 +08:00
2009-04-30 21:54:51 +08:00
/ * *
* Returns the contig index of a specified string version of the contig
* @param contig the contig string
* @return the contig index , - 1 if not found
* /
public static int getContigIndex ( final String contig ) {
2009-04-17 05:54:56 +08:00
if ( contigInfo . getSequenceIndex ( contig ) = = - 1 )
Utils . scareUser ( String . format ( "Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary" , contig ) ) ;
2009-03-29 04:37:27 +08:00
return contigInfo . getSequenceIndex ( contig ) ;
}
2009-03-13 07:30:19 +08:00
2009-03-29 04:37:27 +08:00
public static boolean setupRefContigOrdering ( final ReferenceSequenceFile refFile ) {
return setupRefContigOrdering ( refFile . getSequenceDictionary ( ) ) ;
}
2009-03-03 02:18:48 +08:00
2009-03-29 04:37:27 +08:00
public static boolean setupRefContigOrdering ( final SAMSequenceDictionary seqDict ) {
2009-04-08 05:44:39 +08:00
if ( seqDict = = null ) { // we couldn't load the reference dictionary
logger . info ( "Failed to load reference dictionary, falling back to lexicographic order for contigs" ) ;
2009-04-14 08:53:08 +08:00
Utils . scareUser ( "Failed to load reference dictionary" ) ;
2009-03-29 04:37:27 +08:00
return false ;
2009-05-01 06:14:26 +08:00
} else if ( contigInfo = = null ) {
2009-03-29 04:37:27 +08:00
contigInfo = seqDict ;
2009-04-10 06:04:59 +08:00
logger . debug ( String . format ( "Prepared reference sequence contig dictionary" ) ) ;
2009-03-29 04:37:27 +08:00
for ( SAMSequenceRecord contig : seqDict . getSequences ( ) ) {
2009-04-10 06:04:59 +08:00
logger . debug ( String . format ( " %s (%d bp)" , contig . getSequenceName ( ) , contig . getSequenceLength ( ) ) ) ;
2009-03-29 04:37:27 +08:00
}
}
return true ;
}
// --------------------------------------------------------------------------------------------------------------
//
// constructors
//
// --------------------------------------------------------------------------------------------------------------
2009-04-12 10:25:17 +08:00
public GenomeLoc ( int contigIndex , final long start , final long stop ) {
2009-04-30 21:54:51 +08:00
if ( contigInfo = = null ) { throw new StingException ( "Contig info has not been setup in the GenomeLoc context yet." ) ; }
2009-06-03 02:14:46 +08:00
if ( ! isSequenceIndexValid ( contigIndex ) ) {
2009-04-30 21:54:51 +08:00
throw new StingException ( "Contig info has not been setup in the GenomeLoc context yet." ) ;
}
if ( start < 0 ) { throw new StingException ( "Bad start position " + start ) ; }
if ( stop < - 1 ) { throw new StingException ( "Bad stop position " + stop ) ; } // a negative -1 indicates it's not a meaningful end position
2009-03-13 07:30:19 +08:00
2009-04-12 10:25:17 +08:00
this . contigIndex = contigIndex ;
2009-03-03 02:18:48 +08:00
this . start = start ;
2009-04-16 03:12:28 +08:00
this . stop = stop = = - 1 ? start : stop ;
2009-03-03 02:18:48 +08:00
}
2009-04-14 08:53:08 +08:00
public GenomeLoc ( final SAMRecord read ) {
this ( read . getReferenceIndex ( ) , read . getAlignmentStart ( ) , read . getAlignmentEnd ( ) ) ;
}
public GenomeLoc ( final String contig , final long start , final long stop ) {
2009-04-12 10:25:17 +08:00
this ( contigInfo . getSequenceIndex ( contig ) , start , stop ) ;
}
2009-03-03 02:18:48 +08:00
public GenomeLoc ( final String contig , final long pos ) {
2009-04-14 08:53:08 +08:00
this ( contig , pos , pos ) ;
2009-04-12 10:25:17 +08:00
}
public GenomeLoc ( final int contig , final long pos ) {
this ( contig , pos , pos ) ;
2009-03-03 02:18:48 +08:00
}
2009-03-13 07:30:19 +08:00
public GenomeLoc ( final GenomeLoc toCopy ) {
2009-04-12 10:25:17 +08:00
this ( toCopy . contigIndex , toCopy . getStart ( ) , toCopy . getStop ( ) ) ;
2009-03-13 07:30:19 +08:00
}
2009-03-29 04:37:27 +08:00
// --------------------------------------------------------------------------------------------------------------
2009-03-03 02:18:48 +08:00
//
// Parsing string representations
//
2009-03-29 04:37:27 +08:00
// --------------------------------------------------------------------------------------------------------------
2009-03-03 02:18:48 +08:00
private static long parsePosition ( final String pos ) {
String x = pos . replaceAll ( "," , "" ) ;
2009-03-13 07:30:19 +08:00
return Long . parseLong ( x ) ;
2009-03-03 02:18:48 +08:00
}
2009-06-03 02:14:46 +08:00
/ * *
* Use this static constructor when the input data is under limited control ( i . e . parsing user data ) .
* @param contig Contig to parse .
* @param start Starting point .
* @param stop Stop point .
* @return The genome location , or a MalformedGenomeLocException if unparseable .
* /
public static GenomeLoc parseGenomeLoc ( final String contig , long start , long stop ) {
if ( ! isContigValid ( contig ) )
2009-06-05 23:49:03 +08:00
throw new MalformedGenomeLocException ( "Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference." ) ;
2009-06-03 02:14:46 +08:00
return new GenomeLoc ( contig , start , stop ) ;
}
2009-03-03 02:18:48 +08:00
public static GenomeLoc parseGenomeLoc ( final String str ) {
2009-04-04 02:24:08 +08:00
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
2009-03-25 10:17:48 +08:00
//System.out.printf("Parsing location '%s'%n", str);
2009-03-03 02:18:48 +08:00
2009-04-15 06:13:10 +08:00
final Pattern regex1 = Pattern . compile ( "([\\w&&[^:]]+)$" ) ; // matches case 1
final Pattern regex2 = Pattern . compile ( "([\\w&&[^:]]+):([\\d,]+)$" ) ; // matches case 2
final Pattern regex3 = Pattern . compile ( "([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$" ) ; // matches case 3
final Pattern regex4 = Pattern . compile ( "([\\w&&[^:]]+):([\\d,]+)\\+" ) ; // matches case 4
2009-03-03 02:18:48 +08:00
String contig = null ;
long start = 1 ;
long stop = Integer . MAX_VALUE ;
boolean bad = false ;
2009-03-13 07:30:19 +08:00
2009-03-03 02:18:48 +08:00
Matcher match1 = regex1 . matcher ( str ) ;
Matcher match2 = regex2 . matcher ( str ) ;
Matcher match3 = regex3 . matcher ( str ) ;
2009-04-15 06:13:10 +08:00
Matcher match4 = regex4 . matcher ( str ) ;
2009-03-03 02:18:48 +08:00
try {
if ( match1 . matches ( ) ) {
contig = match1 . group ( 1 ) ;
}
else if ( match2 . matches ( ) ) {
contig = match2 . group ( 1 ) ;
start = parsePosition ( match2 . group ( 2 ) ) ;
2009-04-12 10:25:17 +08:00
stop = start ;
2009-03-03 02:18:48 +08:00
}
2009-04-15 06:13:10 +08:00
else if ( match4 . matches ( ) ) {
contig = match4 . group ( 1 ) ;
start = parsePosition ( match4 . group ( 2 ) ) ;
}
2009-03-03 02:18:48 +08:00
else if ( match3 . matches ( ) ) {
contig = match3 . group ( 1 ) ;
start = parsePosition ( match3 . group ( 2 ) ) ;
stop = parsePosition ( match3 . group ( 3 ) ) ;
if ( start > stop )
bad = true ;
}
else {
bad = true ;
}
} catch ( Exception e ) {
bad = true ;
}
if ( bad ) {
2009-04-30 21:54:51 +08:00
throw new StingException ( "Invalid Genome Location string: " + str ) ;
2009-03-03 02:18:48 +08:00
}
2009-03-29 04:37:27 +08:00
if ( stop = = Integer . MAX_VALUE & & hasKnownContigOrdering ( ) ) {
// lookup the actually stop position!
stop = getContigInfo ( contig ) . getSequenceLength ( ) ;
}
2009-06-03 02:14:46 +08:00
if ( ! isContigValid ( contig ) )
2009-06-05 23:49:03 +08:00
throw new MalformedGenomeLocException ( "Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference." ) ;
2009-06-03 02:14:46 +08:00
GenomeLoc loc = parseGenomeLoc ( contig , start , stop ) ;
2009-04-08 05:44:39 +08:00
// System.out.printf(" => Parsed location '%s' into %s%n", str, loc);
2009-03-03 02:18:48 +08:00
return loc ;
}
2009-03-29 04:37:27 +08:00
/ * *
* Useful utility function that parses a location string into a coordinate - order sorted
* array of GenomeLoc objects
*
2009-04-10 04:28:17 +08:00
* @param str String representation of genome locs . Null string corresponds to no filter .
2009-03-29 04:37:27 +08:00
* @return Array of GenomeLoc objects corresponding to the locations in the string , sorted by coordinate order
* /
2009-05-28 09:35:49 +08:00
public static List < GenomeLoc > parseGenomeLocs ( final String str ) {
2009-04-10 04:28:17 +08:00
// Null string means no filter.
if ( str = = null ) return null ;
2009-03-29 04:37:27 +08:00
// Of the form: loc1;loc2;...
// Where each locN can be:
2009-04-04 02:24:08 +08:00
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
2009-03-29 04:37:27 +08:00
try {
2009-05-28 09:35:49 +08:00
List < GenomeLoc > locs = new ArrayList < GenomeLoc > ( ) ;
for ( String loc : str . split ( ";" ) )
locs . add ( parseGenomeLoc ( loc . trim ( ) ) ) ;
2009-03-29 04:37:27 +08:00
Collections . sort ( locs ) ;
//logger.info(String.format("Going to process %d locations", locs.length));
locs = mergeOverlappingLocations ( locs ) ;
2009-06-13 00:29:26 +08:00
logger . debug ( "Locations are:" + Utils . join ( ", " , locs ) ) ;
2009-03-29 04:37:27 +08:00
return locs ;
} catch ( Exception e ) {
e . printStackTrace ( ) ;
Utils . scareUser ( String . format ( "Invalid locations string: %s, format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'" , str ) ) ;
return null ;
}
}
2009-05-28 09:35:49 +08:00
public static List < GenomeLoc > mergeOverlappingLocations ( final List < GenomeLoc > raw ) {
logger . debug ( " Raw locations are:\n" + Utils . join ( "\n" , raw ) ) ;
2009-03-29 04:37:27 +08:00
if ( raw . size ( ) < = 1 )
return raw ;
else {
ArrayList < GenomeLoc > merged = new ArrayList < GenomeLoc > ( ) ;
Iterator < GenomeLoc > it = raw . iterator ( ) ;
GenomeLoc prev = it . next ( ) ;
while ( it . hasNext ( ) ) {
GenomeLoc curr = it . next ( ) ;
if ( prev . contiguousP ( curr ) ) {
prev = prev . merge ( curr ) ;
} else {
merged . add ( prev ) ;
prev = curr ;
}
}
merged . add ( prev ) ;
return merged ;
}
}
2009-04-24 03:03:14 +08:00
/ * *
* Move this Genome loc to the next contig , with a start
* and stop of 1.
* @return true if we are not out of contigs , otherwise false if we ' re
* at the end of the genome ( no more contigs to jump to ) .
* /
public boolean toNextContig ( ) {
2009-04-24 03:08:16 +08:00
if ( ( contigIndex + 1 ) < GenomeLoc . contigInfo . size ( ) ) {
2009-04-24 03:03:14 +08:00
this . contigIndex + + ;
this . start = 1 ;
this . stop = 1 ;
return true ;
}
return false ;
}
2009-03-29 04:37:27 +08:00
/ * *
* Returns true iff we have a specified series of locations to process AND we are past the last
* location in the list . It means that , in a serial processing of the genome , that we are done .
*
* @param curr Current genome Location
* @return true if we are past the last location to process
* /
2009-05-12 09:22:18 +08:00
public static boolean pastFinalLocation ( GenomeLoc curr , List < GenomeLoc > locs ) {
2009-04-15 03:13:16 +08:00
return ( locs . size ( ) > 0 & & curr . isPast ( locs . get ( locs . size ( ) - 1 ) ) ) ;
2009-03-29 04:37:27 +08:00
}
/ * *
* A key function that returns true if the proposed GenomeLoc curr is within the list of
* locations we are processing in this TraversalEngine
*
* @param curr
* @return true if we should process GenomeLoc curr , otherwise false
* /
public static boolean inLocations ( GenomeLoc curr , ArrayList < GenomeLoc > locs ) {
2009-03-31 07:43:12 +08:00
if ( locs . size ( ) = = 0 ) {
2009-03-29 04:37:27 +08:00
return true ;
} else {
2009-04-04 02:24:08 +08:00
for ( GenomeLoc loc : locs ) {
2009-03-29 04:37:27 +08:00
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
2009-04-04 02:24:08 +08:00
if ( loc . overlapsP ( curr ) )
2009-03-29 04:37:27 +08:00
return true ;
}
return false ;
}
}
2009-04-04 02:24:08 +08:00
public static void removePastLocs ( GenomeLoc curr , List < GenomeLoc > locs ) {
while ( ! locs . isEmpty ( ) & & curr . isPast ( locs . get ( 0 ) ) ) {
//System.out.println("At: " + curr + ", removing: " + locs.get(0));
locs . remove ( 0 ) ;
}
}
public static boolean overlapswithSortedLocsP ( GenomeLoc curr , List < GenomeLoc > locs , boolean returnTrueIfEmpty ) {
if ( locs . isEmpty ( ) )
return returnTrueIfEmpty ;
// skip loci before intervals begin
2009-04-12 10:25:17 +08:00
if ( hasKnownContigOrdering ( ) & & curr . contigIndex < locs . get ( 0 ) . contigIndex )
2009-04-04 02:24:08 +08:00
return false ;
for ( GenomeLoc loc : locs ) {
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
if ( loc . overlapsP ( curr ) )
return true ;
if ( curr . compareTo ( loc ) < 0 )
return false ;
}
return false ;
}
2009-03-03 02:18:48 +08:00
//
// Accessors and setters
//
2009-04-13 08:48:21 +08:00
public final String getContig ( ) {
2009-04-30 21:54:51 +08:00
//this.contigIndex != -1;
if ( ! ( contigInfo ! = null & & contigInfo . getSequences ( ) ! = null ) ) {
throw new StingException ( "The contig information or it's sequences are null" ) ;
}
if ( ( this . contigIndex < 0 ) | | ( this . contigIndex > = contigInfo . getSequences ( ) . size ( ) ) ) {
throw new StingException ( "The contig index is not bounded by the zero and seqeunce count, contig index: " + contigIndex ) ;
}
if ( contigInfo . getSequence ( this . contigIndex ) = = null | |
contigInfo . getSequence ( this . contigIndex ) . getSequenceName ( ) = = null ) {
throw new StingException ( "The associated sequence index for contig " + contigIndex + " is null" ) ;
}
2009-04-14 08:53:08 +08:00
return contigInfo . getSequence ( this . contigIndex ) . getSequenceName ( ) ;
//if (contigInfo != null && contigInfo.getSequence(this.contigIndex) != null) {
// return contigInfo.getSequence(this.contigIndex).getSequenceName();
//}
2009-04-13 08:48:21 +08:00
2009-04-14 08:53:08 +08:00
//return null;
2009-04-13 08:48:21 +08:00
}
2009-04-12 10:25:17 +08:00
public final int getContigIndex ( ) { return this . contigIndex ; }
2009-03-03 02:18:48 +08:00
public final long getStart ( ) { return this . start ; }
public final long getStop ( ) { return this . stop ; }
public final String toString ( ) {
if ( throughEndOfContigP ( ) & & atBeginningOfContigP ( ) )
return getContig ( ) ;
else if ( throughEndOfContigP ( ) | | getStart ( ) = = getStop ( ) )
return String . format ( "%s:%d" , getContig ( ) , getStart ( ) ) ;
else
return String . format ( "%s:%d-%d" , getContig ( ) , getStart ( ) , getStop ( ) ) ;
}
2009-04-12 10:25:17 +08:00
public final boolean isUnmapped ( ) { return this . contigIndex = = SAMRecord . NO_ALIGNMENT_REFERENCE_INDEX ; }
2009-03-03 02:18:48 +08:00
public final boolean throughEndOfContigP ( ) { return this . stop = = Integer . MAX_VALUE ; }
public final boolean atBeginningOfContigP ( ) { return this . start = = 1 ; }
public void setContig ( String contig ) {
2009-04-12 10:25:17 +08:00
this . contigIndex = contigInfo . getSequenceIndex ( contig ) ;
2009-03-03 02:18:48 +08:00
}
public void setStart ( long start ) {
this . start = start ;
}
public void setStop ( long stop ) {
this . stop = stop ;
}
public final boolean isSingleBP ( ) { return stop = = start ; }
public final boolean disjointP ( GenomeLoc that ) {
2009-04-12 10:25:17 +08:00
if ( this . contigIndex ! = that . contigIndex ) return true ; // different chromosomes
if ( this . start > that . stop ) return true ; // this guy is past that
if ( that . start > this . stop ) return true ; // that guy is past our start
2009-03-03 02:18:48 +08:00
return false ;
}
2009-03-13 07:30:19 +08:00
2009-03-29 04:37:27 +08:00
public final boolean discontinuousP ( GenomeLoc that ) {
2009-04-12 10:25:17 +08:00
if ( this . contigIndex ! = that . contigIndex ) return true ; // different chromosomes
if ( ( this . start - 1 ) > that . stop ) return true ; // this guy is past that
if ( ( that . start - 1 ) > this . stop ) return true ; // that guy is past our start
2009-03-29 04:37:27 +08:00
return false ;
}
2009-03-03 02:18:48 +08:00
public final boolean overlapsP ( GenomeLoc that ) {
return ! disjointP ( that ) ;
}
2009-03-29 04:37:27 +08:00
public final boolean contiguousP ( GenomeLoc that ) {
return ! discontinuousP ( that ) ;
}
2009-04-30 21:54:51 +08:00
public GenomeLoc merge ( GenomeLoc that ) throws StingException {
if ( ! ( this . contiguousP ( that ) ) ) {
throw new StingException ( "The two genome loc's need to be contigous" ) ;
}
2009-03-29 04:37:27 +08:00
return new GenomeLoc ( getContig ( ) ,
Math . min ( getStart ( ) , that . getStart ( ) ) ,
Math . max ( getStop ( ) , that . getStop ( ) ) ) ;
}
public final boolean containsP ( GenomeLoc that ) {
if ( ! onSameContig ( that ) ) return false ;
return getStart ( ) < = that . getStart ( ) & & getStop ( ) > = that . getStop ( ) ;
}
2009-03-03 02:18:48 +08:00
public final boolean onSameContig ( GenomeLoc that ) {
2009-04-12 10:25:17 +08:00
return ( this . contigIndex = = that . contigIndex ) ;
2009-03-03 02:18:48 +08:00
}
2009-03-13 07:30:19 +08:00
public final int minus ( final GenomeLoc that ) {
2009-04-12 10:25:17 +08:00
if ( this . contigIndex = = that . contigIndex )
2009-03-13 07:30:19 +08:00
return ( int ) ( this . getStart ( ) - that . getStart ( ) ) ;
else
return Integer . MAX_VALUE ;
}
public final int distance ( final GenomeLoc that ) {
return Math . abs ( minus ( that ) ) ;
}
public final boolean isBetween ( final GenomeLoc left , final GenomeLoc right ) {
return this . compareTo ( left ) > - 1 & & this . compareTo ( right ) < 1 ;
}
2009-05-14 02:51:16 +08:00
public final boolean isBefore ( GenomeLoc that ) {
int comparison = this . compareContigs ( that ) ;
return ( comparison = = - 1 | | ( comparison = = 0 & & this . getStop ( ) < that . getStart ( ) ) ) ;
}
2009-03-29 04:37:27 +08:00
public final boolean isPast ( GenomeLoc that ) {
2009-04-04 04:05:24 +08:00
int comparison = this . compareContigs ( that ) ;
return ( comparison = = 1 | | ( comparison = = 0 & & this . getStart ( ) > that . getStop ( ) ) ) ;
2009-03-29 04:37:27 +08:00
}
2009-03-13 07:30:19 +08:00
public final void incPos ( ) {
incPos ( 1 ) ;
}
public final void incPos ( long by ) {
this . start + = by ;
this . stop + = by ;
}
2009-03-03 02:18:48 +08:00
2009-03-13 07:30:19 +08:00
public final GenomeLoc nextLoc ( ) {
GenomeLoc n = new GenomeLoc ( this ) ;
n . incPos ( ) ;
return n ;
}
2009-05-01 06:14:26 +08:00
2009-05-14 02:51:16 +08:00
/ * *
* Check to see whether two genomeLocs are equal .
* Note that this implementation ignores the contigInfo object .
* @param other Other contig to compare .
* /
@Override
public boolean equals ( Object other ) {
if ( other = = null )
return false ;
if ( other instanceof GenomeLoc ) {
GenomeLoc otherGenomeLoc = ( GenomeLoc ) other ;
2009-06-18 04:19:47 +08:00
return this . contigIndex = = otherGenomeLoc . contigIndex & &
2009-05-14 02:51:16 +08:00
this . start = = otherGenomeLoc . start & &
this . stop = = otherGenomeLoc . stop ;
}
return false ;
}
2009-06-09 00:52:02 +08:00
@Override
public int hashCode ( ) {
return ( int ) ( start < < 16 + stop < < 4 + contigIndex ) ;
}
2009-05-14 02:51:16 +08:00
/ * *
* Return a new GenomeLoc at this same position .
* @return A GenomeLoc with the same contents as the current loc .
* /
@Override
2009-05-27 04:57:46 +08:00
public GenomeLoc clone ( ) {
2009-05-14 02:51:16 +08:00
return new GenomeLoc ( this ) ;
}
2009-05-01 06:14:26 +08:00
2009-03-03 02:18:48 +08:00
//
// Comparison operations
//
2009-04-12 10:25:17 +08:00
// TODO: get rid of this method because it's sloooooooooooooow
2009-04-14 08:53:08 +08:00
@Deprecated
public static int compareContigs ( final String thisContig , final String thatContig )
2009-03-22 23:36:56 +08:00
{
2009-03-13 07:30:19 +08:00
if ( thisContig = = thatContig )
2009-03-22 23:36:56 +08:00
{
2009-03-23 03:53:00 +08:00
// Optimization. If the pointers are equal, then the contigs are equal.
2009-03-13 07:30:19 +08:00
return 0 ;
2009-03-22 23:36:56 +08:00
}
2009-04-14 08:53:08 +08:00
if ( hasKnownContigOrdering ( ) )
2009-03-22 23:36:56 +08:00
{
2009-03-29 04:37:27 +08:00
int thisIndex = getContigIndex ( thisContig ) ;
int thatIndex = getContigIndex ( thatContig ) ;
if ( thisIndex = = - 1 )
2009-03-22 23:36:56 +08:00
{
2009-03-29 04:37:27 +08:00
if ( thatIndex = = - 1 )
2009-03-22 23:36:56 +08:00
{
2009-03-03 02:18:48 +08:00
// Use regular sorted order
return thisContig . compareTo ( thatContig ) ;
}
2009-04-14 08:53:08 +08:00
else
2009-03-22 23:36:56 +08:00
{
2009-03-03 02:18:48 +08:00
// this is always bigger if that is in the key set
return 1 ;
}
}
2009-03-29 04:37:27 +08:00
else if ( thatIndex = = - 1 )
2009-03-22 23:36:56 +08:00
{
2009-03-03 02:18:48 +08:00
return - 1 ;
2009-03-22 23:36:56 +08:00
}
2009-04-14 08:53:08 +08:00
else
2009-03-22 23:36:56 +08:00
{
2009-03-29 04:37:27 +08:00
if ( thisIndex < thatIndex ) return - 1 ;
if ( thisIndex > thatIndex ) return 1 ;
2009-03-03 02:18:48 +08:00
return 0 ;
}
}
2009-04-14 08:53:08 +08:00
else
2009-03-22 23:36:56 +08:00
{
2009-03-03 02:18:48 +08:00
return thisContig . compareTo ( thatContig ) ;
}
}
2009-04-12 10:25:17 +08:00
public final int compareContigs ( GenomeLoc that ) {
2009-06-18 04:19:47 +08:00
if ( this . contigIndex = = that . contigIndex )
return 0 ;
else if ( this . contigIndex > that . contigIndex )
return 1 ;
return - 1 ;
2009-03-03 02:18:48 +08:00
}
public int compareTo ( GenomeLoc that ) {
if ( this = = that ) return 0 ;
2009-04-12 10:25:17 +08:00
final int cmpContig = compareContigs ( that ) ;
2009-03-03 02:18:48 +08:00
if ( cmpContig ! = 0 ) return cmpContig ;
if ( this . getStart ( ) < that . getStart ( ) ) return - 1 ;
if ( this . getStart ( ) > that . getStart ( ) ) return 1 ;
2009-04-04 03:53:33 +08:00
// TODO: and error is being thrown because we are treating reads with the same start positions
// but different stop as out of order
//if ( this.getStop() < that.getStop() ) return -1;
//if ( this.getStop() > that.getStop() ) return 1;
2009-03-03 02:18:48 +08:00
return 0 ;
}
2009-04-16 02:29:38 +08:00
/ * *
* Read a file of genome locations to process .
* regions specified by the location string . The string is of the form :
* Of the form : loc1 ; loc2 ; . . .
* Where each locN can be :
* ' chr2 ' , ' chr2 : 1000000 ' or ' chr2 : 1 , 000 , 000 - 2 , 000 , 000 '
*
* @param file_name
* /
2009-05-28 09:35:49 +08:00
public static List < GenomeLoc > IntervalFileToList ( final String file_name ) {
2009-04-16 02:29:38 +08:00
// first try to read it as an interval file since that's well structured
// we'll fail quickly if it's not a valid file. Then try to parse it as
// a location string file
2009-05-28 09:35:49 +08:00
List < GenomeLoc > ret = null ;
2009-04-16 02:29:38 +08:00
try {
IntervalList il = IntervalList . fromFile ( new File ( file_name ) ) ;
// iterate through the list of merged intervals and add then as GenomeLocs
ret = new ArrayList < GenomeLoc > ( ) ;
for ( Interval interval : il . getUniqueIntervals ( ) ) {
ret . add ( new GenomeLoc ( interval . getSequence ( ) , interval . getStart ( ) , interval . getEnd ( ) ) ) ;
}
return ret ;
} catch ( Exception e ) {
try {
xReadLines reader = new xReadLines ( new File ( file_name ) ) ;
List < String > lines = reader . readLines ( ) ;
reader . close ( ) ;
String locStr = Utils . join ( ";" , lines ) ;
logger . debug ( "locStr: " + locStr ) ;
ret = parseGenomeLocs ( locStr ) ;
return ret ;
} catch ( Exception e2 ) {
2009-06-05 02:24:43 +08:00
logger . error ( "Attempt to parse interval file in GATK format failed: " + e2 . getMessage ( ) ) ;
2009-04-16 02:29:38 +08:00
e2 . printStackTrace ( ) ;
2009-04-30 21:54:51 +08:00
throw new StingException ( "Unable to parse out interval file in either format" , e ) ;
2009-04-16 02:29:38 +08:00
}
}
}
2009-06-03 02:14:46 +08:00
/ * *
* Determines whether the given contig is valid with respect to the sequence dictionary
* already installed in the GenomeLoc .
* @return True if the contig is valid . False otherwise .
* /
private static boolean isContigValid ( String contig ) {
int contigIndex = contigInfo . getSequenceIndex ( contig ) ;
return isSequenceIndexValid ( contigIndex ) ;
}
/ * *
* Determines whether the given sequence index is valid with respect to the sequence dictionary .
* @param sequenceIndex sequence index
* @return True if the sequence index is valid , false otherwise .
* /
private static boolean isSequenceIndexValid ( int sequenceIndex ) {
return sequenceIndex > = 0 & & sequenceIndex < contigInfo . size ( ) ;
}
2009-03-22 23:36:56 +08:00
}