2010-04-20 07:00:08 +08:00
/ *
* Copyright ( c ) 2010 The Broad Institute
2010-04-20 23:26:32 +08:00
*
2010-04-20 07:00:08 +08:00
* Permission is hereby granted , free of charge , to any person
* obtaining a copy of this software and associated documentation
2010-04-20 23:26:32 +08:00
* files ( the "Software" ) , to deal in the Software without
2010-04-20 07:00:08 +08:00
* restriction , including without limitation the rights to use ,
* copy , modify , merge , publish , distribute , sublicense , and / or sell
* copies of the Software , and to permit persons to whom the
* Software is furnished to do so , subject to the following
* conditions :
2010-04-20 23:26:32 +08:00
*
2010-04-20 07:00:08 +08:00
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software .
*
2010-04-20 23:26:32 +08:00
* THE SOFTWARE IS PROVIDED "AS IS" , WITHOUT WARRANTY OF ANY KIND ,
2010-04-20 07:00:08 +08:00
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY ,
* WHETHER IN AN ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING
* FROM , OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE .
* /
2009-06-22 22:39:41 +08:00
package org.broadinstitute.sting.utils ;
2010-06-11 04:54:36 +08:00
import java.io.File ;
2010-06-22 05:49:41 +08:00
import java.io.IOException ;
2010-06-11 04:54:36 +08:00
import java.util.ArrayList ;
import java.util.Iterator ;
import java.util.List ;
2010-06-10 03:25:02 +08:00
import net.sf.picard.reference.ReferenceSequenceFile ;
2010-06-11 04:54:36 +08:00
import net.sf.picard.util.Interval ;
import net.sf.picard.util.IntervalList ;
2009-06-22 22:39:41 +08:00
import net.sf.samtools.SAMRecord ;
import net.sf.samtools.SAMSequenceDictionary ;
import net.sf.samtools.SAMSequenceRecord ;
2010-06-11 04:54:36 +08:00
2009-06-22 22:39:41 +08:00
import org.apache.log4j.Logger ;
2009-08-21 22:40:57 +08:00
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine ;
2010-01-15 08:14:35 +08:00
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion ;
2010-04-01 20:47:48 +08:00
import org.broadinstitute.sting.utils.bed.BedParser ;
2010-09-12 23:07:38 +08:00
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
2010-09-12 22:02:43 +08:00
import org.broadinstitute.sting.utils.exceptions.UserException ;
2010-06-11 04:54:36 +08:00
import org.broadinstitute.sting.utils.interval.IntervalMergingRule ;
2010-04-20 07:00:08 +08:00
import org.broadinstitute.sting.utils.text.XReadLines ;
2009-06-22 22:39:41 +08:00
/ * *
* Created by IntelliJ IDEA .
2009-06-25 00:35:46 +08:00
* User : aaron
2009-06-22 22:39:41 +08:00
* Date : Jun 18 , 2009
* Time : 11 : 17 : 01 PM
* To change this template use File | Settings | File Templates .
* /
public class GenomeLocParser {
private static Logger logger = Logger . getLogger ( GenomeLocParser . class ) ;
// --------------------------------------------------------------------------------------------------------------
//
// Ugly global variable defining the optional ordering of contig elements
//
// --------------------------------------------------------------------------------------------------------------
//public static Map<String, Integer> refContigOrdering = null;
2010-11-11 01:59:50 +08:00
protected SAMSequenceDictionary contigInfo = null ;
2009-06-22 22:39:41 +08:00
/ * *
2010-11-11 01:59:50 +08:00
* set our internal reference contig order
* @param refFile the reference file
2009-06-22 22:39:41 +08:00
* /
2010-11-11 01:59:50 +08:00
public GenomeLocParser ( final ReferenceSequenceFile refFile ) {
this ( refFile . getSequenceDictionary ( ) ) ;
}
public GenomeLocParser ( SAMSequenceDictionary seqDict ) {
if ( seqDict = = null ) { // we couldn't load the reference dictionary
//logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs");
throw new UserException . CommandLineException ( "Failed to load reference dictionary" ) ;
} else if ( contigInfo = = null ) {
contigInfo = seqDict ;
logger . debug ( String . format ( "Prepared reference sequence contig dictionary" ) ) ;
for ( SAMSequenceRecord contig : seqDict . getSequences ( ) ) {
logger . debug ( String . format ( " %s (%d bp)" , contig . getSequenceName ( ) , contig . getSequenceLength ( ) ) ) ;
}
}
2009-06-22 22:39:41 +08:00
}
/ * *
* get the contig ' s SAMSequenceRecord
*
* @param contig the string name of the contig
*
* @return the sam sequence record
* /
2010-11-11 01:59:50 +08:00
public SAMSequenceRecord getContigInfo ( final String contig ) {
2009-06-22 22:39:41 +08:00
return contigInfo . getSequence ( contig ) ;
}
/ * *
* Returns the contig index of a specified string version of the contig
*
* @param contig the contig string
2010-02-17 04:35:35 +08:00
* @param exceptionOut in some cases we don ' t want to exception out if the contig isn ' t valid
2009-06-22 22:39:41 +08:00
*
* @return the contig index , - 1 if not found
* /
2010-11-11 01:59:50 +08:00
public int getContigIndex ( final String contig , boolean exceptionOut ) {
2010-02-17 04:35:35 +08:00
if ( contigInfo . getSequenceIndex ( contig ) = = - 1 & & exceptionOut )
2010-09-12 22:02:43 +08:00
throw new UserException . CommandLineException ( String . format ( "Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary" , contig ) ) ;
2009-06-22 22:39:41 +08:00
return contigInfo . getSequenceIndex ( contig ) ;
}
2010-06-11 04:54:36 +08:00
/ * *
2010-04-01 20:47:48 +08:00
* parse a genome interval , from a location string
*
2010-06-11 04:54:36 +08:00
* Performs interval - style validation :
2010-04-01 20:47:48 +08:00
*
* contig is valid ; start and stop less than the end ; start < = sto
* @param str the string to parse
*
* @return a GenomeLoc representing the String
*
2009-09-22 09:32:35 +08:00
* /
2010-04-01 20:47:48 +08:00
2010-11-11 01:59:50 +08:00
public GenomeLoc parseGenomeInterval ( final String str ) {
2010-04-01 20:47:48 +08:00
GenomeLoc ret = parseGenomeLoc ( str ) ;
exceptionOnInvalidGenomeLocBounds ( ret ) ;
return ret ;
}
2009-09-22 09:32:35 +08:00
2009-06-22 22:39:41 +08:00
/ * *
* parse a genome location , from a location string
2010-06-11 04:54:36 +08:00
*
* Performs read - style validation :
2010-04-01 20:47:48 +08:00
* checks that start and stop are positive , start < stop , and the contig is valid
* does not check that genomeLoc is actually on the contig
2009-06-22 22:39:41 +08:00
*
* @param str the string to parse
*
* @return a GenomeLoc representing the String
2010-04-01 20:47:48 +08:00
*
2009-06-22 22:39:41 +08:00
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc parseGenomeLoc ( final String str ) {
2009-06-22 22:39:41 +08:00
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
//System.out.printf("Parsing location '%s'%n", str);
2010-06-11 04:54:36 +08:00
2009-06-22 22:39:41 +08:00
String contig = null ;
2010-11-11 01:59:50 +08:00
int start = 1 ;
int stop = - 1 ;
2010-06-11 04:54:36 +08:00
final int colonIndex = str . indexOf ( ":" ) ;
if ( colonIndex = = - 1 ) {
contig = str . substring ( 0 , str . length ( ) ) ; // chr1
stop = Integer . MAX_VALUE ;
} else {
contig = str . substring ( 0 , colonIndex ) ;
final int dashIndex = str . indexOf ( '-' , colonIndex ) ;
try {
if ( dashIndex = = - 1 ) {
if ( str . charAt ( str . length ( ) - 1 ) = = '+' ) {
start = parsePosition ( str . substring ( colonIndex + 1 , str . length ( ) - 1 ) ) ; // chr:1+
stop = Integer . MAX_VALUE ;
} else {
start = parsePosition ( str . substring ( colonIndex + 1 ) ) ; // chr1:1
stop = start ;
}
} else {
start = parsePosition ( str . substring ( colonIndex + 1 , dashIndex ) ) ; // chr1:1-1
stop = parsePosition ( str . substring ( dashIndex + 1 ) ) ;
2010-03-11 00:25:16 +08:00
}
2010-06-11 04:54:36 +08:00
} catch ( Exception e ) {
2010-09-12 22:02:43 +08:00
throw new UserException ( "Failed to parse Genome Location string: " + str , e ) ;
2010-03-11 00:25:16 +08:00
}
2009-06-22 22:39:41 +08:00
}
2010-04-01 20:47:48 +08:00
// is the contig valid?
if ( ! isContigValid ( contig ) )
2010-09-12 22:02:43 +08:00
throw new UserException ( "Contig '" + contig + "' does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?" ) ;
2009-09-22 06:37:47 +08:00
2010-11-11 01:59:50 +08:00
if ( stop = = Integer . MAX_VALUE )
2009-06-22 22:39:41 +08:00
// lookup the actually stop position!
stop = getContigInfo ( contig ) . getSequenceLength ( ) ;
2010-04-01 20:47:48 +08:00
GenomeLoc locus = new GenomeLoc ( contig , getContigIndex ( contig , true ) , start , stop ) ;
exceptionOnInvalidGenomeLoc ( locus ) ;
return locus ;
2009-06-22 22:39:41 +08:00
}
// --------------------------------------------------------------------------------------------------------------
//
// Parsing string representations
//
// --------------------------------------------------------------------------------------------------------------
2010-06-11 04:54:36 +08:00
/ * *
* Parses a number like 1 , 000 , 000 into a long .
* @param pos
* /
2010-11-11 01:59:50 +08:00
private int parsePosition ( final String pos ) {
2010-06-11 04:54:36 +08:00
//String x = pos.replaceAll(",", ""); - this was replaced because it uses regexps
//System.out.println("Parsing position: '" + pos + "'");
if ( pos . indexOf ( '-' ) ! = - 1 ) {
throw new NumberFormatException ( "Position: '" + pos + "' can't contain '-'." ) ;
}
if ( pos . indexOf ( ',' ) ! = - 1 ) {
final StringBuilder buffer = new StringBuilder ( ) ;
for ( int i = 0 ; i < pos . length ( ) ; i + + ) {
final char c = pos . charAt ( i ) ;
if ( c = = ',' ) {
continue ;
} else if ( c < '0' | | c > '9' ) {
throw new NumberFormatException ( "Position: '" + pos + "' contains invalid chars." ) ;
2010-11-11 01:59:50 +08:00
} else {
2010-06-11 04:54:36 +08:00
buffer . append ( c ) ;
}
}
2010-11-11 01:59:50 +08:00
return Integer . parseInt ( buffer . toString ( ) ) ;
2010-06-11 04:54:36 +08:00
} else {
2010-11-11 01:59:50 +08:00
return Integer . parseInt ( pos ) ;
2010-06-11 04:54:36 +08:00
}
2009-06-22 22:39:41 +08:00
}
/ * *
* merge a list of genome locs that may be overlapping , returning the list of unique genomic locations
*
* @param raw the unchecked genome loc list
2009-12-24 05:59:14 +08:00
* @param rule the merging rule we ' re using
2009-06-22 22:39:41 +08:00
*
* @return the list of merged locations
* /
2010-11-11 01:59:50 +08:00
public List < GenomeLoc > mergeIntervalLocations ( final List < GenomeLoc > raw , IntervalMergingRule rule ) {
2010-04-01 20:47:48 +08:00
if ( raw . size ( ) < = 1 )
2009-06-22 22:39:41 +08:00
return raw ;
else {
ArrayList < GenomeLoc > merged = new ArrayList < GenomeLoc > ( ) ;
Iterator < GenomeLoc > it = raw . iterator ( ) ;
GenomeLoc prev = it . next ( ) ;
while ( it . hasNext ( ) ) {
GenomeLoc curr = it . next ( ) ;
2009-12-24 05:59:14 +08:00
if ( prev . overlapsP ( curr ) ) {
prev = prev . merge ( curr ) ;
2010-01-15 08:14:35 +08:00
} else if ( prev . contiguousP ( curr ) & & rule = = IntervalMergingRule . ALL ) {
2009-06-22 22:39:41 +08:00
prev = prev . merge ( curr ) ;
} else {
merged . add ( prev ) ;
prev = curr ;
}
}
merged . add ( prev ) ;
return merged ;
}
}
/ * *
* Determines whether the given contig is valid with respect to the sequence dictionary
* already installed in the GenomeLoc .
*
* @return True if the contig is valid . False otherwise .
* /
2010-11-11 01:59:50 +08:00
private boolean isContigValid ( String contig ) {
2009-06-22 22:39:41 +08:00
int contigIndex = contigInfo . getSequenceIndex ( contig ) ;
2010-04-01 20:47:48 +08:00
return contigIndex > = 0 & & contigIndex < contigInfo . size ( ) ;
2009-06-22 22:39:41 +08:00
}
/ * *
* Use this static constructor when the input data is under limited control ( i . e . parsing user data ) .
*
* @param contig Contig to parse .
* @param start Starting point .
* @param stop Stop point .
*
* @return The genome location , or a MalformedGenomeLocException if unparseable .
2010-04-01 20:47:48 +08:00
*
* Validation : only checks that contig is valid
* start / stop could be anything
2009-06-22 22:39:41 +08:00
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc parseGenomeLoc ( final String contig , int start , int stop ) {
2009-06-22 22:39:41 +08:00
if ( ! isContigValid ( contig ) )
2010-04-09 13:52:53 +08:00
throw new MalformedGenomeLocException ( "Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?" ) ;
2010-02-17 04:35:35 +08:00
return new GenomeLoc ( contig , getContigIndex ( contig , true ) , start , stop ) ;
2009-06-22 22:39:41 +08:00
}
/ * *
* Read a file of genome locations to process .
* regions specified by the location string . The string is of the form :
* Of the form : loc1 ; loc2 ; . . .
* Where each locN can be :
* ' chr2 ' , ' chr2 : 1000000 ' or ' chr2 : 1 , 000 , 000 - 2 , 000 , 000 '
*
2010-09-25 10:49:30 +08:00
* @param file_name interval file
* @param allowEmptyIntervalList if false empty interval lists will return null
2010-06-22 05:49:41 +08:00
* @return List < GenomeLoc > List of Genome Locs that have been parsed from file
2009-06-22 22:39:41 +08:00
* /
2010-11-11 01:59:50 +08:00
public List < GenomeLoc > intervalFileToList ( final String file_name , boolean allowEmptyIntervalList ) {
2010-04-01 20:47:48 +08:00
// try to open file
2010-09-10 23:25:30 +08:00
File inputFile = new File ( file_name ) ;
2010-04-01 20:47:48 +08:00
// check if file is empty
if ( inputFile . exists ( ) & & inputFile . length ( ) < 1 ) {
2010-09-25 10:49:30 +08:00
if ( allowEmptyIntervalList )
2010-04-01 20:47:48 +08:00
return new ArrayList < GenomeLoc > ( ) ;
else {
Utils . warnUser ( "The interval file " + file_name + " is empty. The GATK will continue processing but you " +
"may want to fix (or exclude) this file." ) ;
return null ;
}
}
// case: BED file
if ( file_name . toUpperCase ( ) . endsWith ( ".BED" ) ) {
2010-11-11 01:59:50 +08:00
BedParser parser = new BedParser ( this , inputFile ) ;
2010-04-13 23:50:38 +08:00
return parser . getLocations ( ) ;
2010-04-01 20:47:48 +08:00
}
2009-06-22 22:39:41 +08:00
/ * *
2010-04-01 20:47:48 +08:00
* IF not a BED file :
2009-06-22 22:39:41 +08:00
* first try to read it as an interval file since that ' s well structured
* we ' ll fail quickly if it ' s not a valid file . Then try to parse it as
* a location string file
* /
try {
2009-08-21 13:35:49 +08:00
IntervalList il = IntervalList . fromFile ( inputFile ) ;
2009-08-21 22:40:57 +08:00
2009-06-22 22:39:41 +08:00
// iterate through the list of merged intervals and add then as GenomeLocs
2010-04-01 20:47:48 +08:00
List < GenomeLoc > ret = new ArrayList < GenomeLoc > ( ) ;
2009-06-22 22:39:41 +08:00
for ( Interval interval : il . getUniqueIntervals ( ) ) {
2010-02-17 04:35:35 +08:00
ret . add ( new GenomeLoc ( interval . getSequence ( ) , getContigIndex ( interval . getSequence ( ) , true ) , interval . getStart ( ) , interval . getEnd ( ) ) ) ;
2009-06-22 22:39:41 +08:00
}
2010-04-01 20:47:48 +08:00
// always return null instead of empty list
return ret . isEmpty ( ) ? null : ret ;
2009-06-22 22:39:41 +08:00
2010-04-01 20:47:48 +08:00
}
// if that didn't work, try parsing file as an old fashioned string file
catch ( Exception e ) {
2009-06-22 22:39:41 +08:00
try {
2010-04-01 20:47:48 +08:00
List < GenomeLoc > ret = new ArrayList < GenomeLoc > ( ) ;
2010-04-20 07:00:08 +08:00
XReadLines reader = new XReadLines ( new File ( file_name ) ) ;
2010-03-23 06:04:45 +08:00
for ( String line : reader ) {
2010-04-01 20:47:48 +08:00
ret . add ( parseGenomeInterval ( line ) ) ;
2010-03-23 06:04:45 +08:00
}
2009-06-22 22:39:41 +08:00
reader . close ( ) ;
2010-03-23 06:04:45 +08:00
2010-04-01 20:47:48 +08:00
// always return null instead of empty list
return ret . isEmpty ( ) ? null : ret ;
}
2010-06-22 05:49:41 +08:00
catch ( IOException e2 ) {
2010-09-12 22:02:43 +08:00
throw new UserException . CouldNotReadInputFile ( new File ( file_name ) , e ) ;
2009-06-22 22:39:41 +08:00
}
}
}
/ * *
* get the sequence name from a sequence index
*
* @param contigIndex get the contig index
*
* @return the string that represents that contig name
* /
2010-11-11 01:59:50 +08:00
private String getSequenceNameFromIndex ( int contigIndex ) {
return contigInfo . getSequence ( contigIndex ) . getSequenceName ( ) ;
2009-06-22 22:39:41 +08:00
}
2010-08-05 11:19:02 +08:00
/ * *
* create a genome loc , given the contig name , start , and stop
*
* @param contig the contig name
* @param start the starting position
* @param stop the stop position
*
2009-06-22 22:39:41 +08:00
* @return a new genome loc
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc createGenomeLoc ( String contig , final int start , final int stop ) {
return exceptionOnInvalidGenomeLoc ( new GenomeLoc ( contig , getContigIndex ( contig , true ) , start , stop ) ) ;
2009-06-22 22:39:41 +08:00
}
/ * *
* create a genome loc , given a read
*
* @param read
*
* @return
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc createGenomeLoc ( final SAMRecord read ) {
2010-02-17 04:35:35 +08:00
return exceptionOnInvalidGenomeLoc ( new GenomeLoc ( read . getReferenceName ( ) , read . getReferenceIndex ( ) , read . getAlignmentStart ( ) , read . getAlignmentEnd ( ) ) ) ;
2009-06-22 22:39:41 +08:00
}
/ * *
* create a new genome loc , given the contig name , and a single position
*
* @param contig the contig name
* @param pos the postion
*
* @return a genome loc representing a single base at the specified postion on the contig
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc createGenomeLoc ( final String contig , final int pos ) {
return exceptionOnInvalidGenomeLoc ( new GenomeLoc ( contig , getContigIndex ( contig , true ) , pos , pos ) ) ;
2009-06-22 22:39:41 +08:00
}
/ * *
* verify the specified genome loc is valid , if it ' s not , throw an exception
2009-10-23 03:31:15 +08:00
* Will not verify the location against contig bounds .
2010-04-01 20:47:48 +08:00
*
2010-06-11 04:54:36 +08:00
*
* Validation :
* checks that start and stop are positive , start < stop , and the contig is valid
* does not check that genomeLoc is actually on the contig , so start could be > end of contig
2009-06-22 22:39:41 +08:00
*
2009-10-23 03:31:15 +08:00
* @param toReturn the genome loc we ' re about to return
2009-06-22 22:39:41 +08:00
*
* @return the genome loc if it ' s valid , otherwise we throw an exception
2010-04-01 20:47:48 +08:00
*
2009-06-22 22:39:41 +08:00
* /
2010-11-11 01:59:50 +08:00
private GenomeLoc exceptionOnInvalidGenomeLoc ( GenomeLoc toReturn ) {
2009-06-22 22:39:41 +08:00
if ( toReturn . getStart ( ) < 0 ) {
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "Parameters to GenomeLocParser are incorrect: the start position is less than 0" ) ;
2009-06-22 22:39:41 +08:00
}
2009-07-01 03:17:24 +08:00
if ( ( toReturn . getStop ( ) ! = - 1 ) & & ( toReturn . getStop ( ) < 0 ) ) {
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "Parameters to GenomeLocParser are incorrect: the stop position is less than 0" ) ;
2009-06-22 22:39:41 +08:00
}
if ( toReturn . getContigIndex ( ) < 0 ) {
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "Parameters to GenomeLocParser are incorrect: the contig index is less than 0" ) ;
2009-06-22 22:39:41 +08:00
}
if ( toReturn . getContigIndex ( ) > = contigInfo . getSequences ( ) . size ( ) ) {
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "Parameters to GenomeLocParser are incorrect: the contig index is greater then the stored sequence count" ) ;
2009-06-22 22:39:41 +08:00
}
return toReturn ;
}
2009-10-23 03:31:15 +08:00
/ * *
* Verify the locus against the bounds of the contig .
2010-04-01 20:47:48 +08:00
*
* performs boundary validation for genome loc INTERVALS :
* start and stop are on contig and start < = stop
* does NOT check that start and stop > 0 , or that contig is valid
* for that reason , this function should only be called AFTER exceptionOnInvalidGenomeLoc ( )
* exceptionOnInvalidGenomeLoc isn ' t included in this function to save time
*
2009-10-23 03:31:15 +08:00
* @param locus Locus to verify .
* /
2010-11-11 01:59:50 +08:00
private void exceptionOnInvalidGenomeLocBounds ( GenomeLoc locus ) {
2009-10-23 03:31:15 +08:00
int contigSize = contigInfo . getSequence ( locus . getContigIndex ( ) ) . getSequenceLength ( ) ;
if ( locus . getStart ( ) > contigSize )
2011-03-18 10:58:29 +08:00
throw new UserException . MalformedGenomeLoc ( "GenomeLoc is invalid: locus start is after the end of contig" , locus ) ;
2009-10-23 03:31:15 +08:00
if ( locus . getStop ( ) > contigSize )
2011-03-18 10:58:29 +08:00
throw new UserException . MalformedGenomeLoc ( "GenomeLoc is invalid: locus stop is after the end of contig" , locus ) ;
2010-04-01 20:47:48 +08:00
if ( locus . getStart ( ) > locus . getStop ( ) ) {
2011-03-18 10:58:29 +08:00
throw new UserException . MalformedGenomeLoc ( "Parameters to GenomeLocParser are incorrect: the start position is greater than the end position" , locus ) ;
2010-04-01 20:47:48 +08:00
}
2009-10-23 03:31:15 +08:00
}
2010-02-17 04:35:35 +08:00
/ * *
* a method for validating genome locs as valid
*
* @param loc the location to validate
*
* @return true if the passed in GenomeLoc represents a valid location
2010-04-01 20:47:48 +08:00
*
* performs interval - style validation : contig is valid and atart and stop less than the end
2010-02-17 04:35:35 +08:00
* /
2010-11-11 01:59:50 +08:00
public boolean validGenomeLoc ( GenomeLoc loc ) {
2010-02-17 04:35:35 +08:00
// quick check before we get the contig size, is the contig number valid
if ( ( loc . getContigIndex ( ) < 0 ) | | // the contig index has to be positive
( loc . getContigIndex ( ) > = contigInfo . getSequences ( ) . size ( ) ) ) // the contig must be in the integer range of contigs)
return false ;
int contigSize = contigInfo . getSequence ( loc . getContigIndex ( ) ) . getSequenceLength ( ) ;
if ( ( loc . getStart ( ) < 0 ) | | // start must be greater than 0
( ( loc . getStop ( ) ! = - 1 ) & & ( loc . getStop ( ) < 0 ) ) | | // the stop can be -1, but no other neg number
( loc . getStart ( ) > contigSize ) | | // the start must be before or equal to the contig end
( loc . getStop ( ) > contigSize ) ) // the stop must also be before or equal to the contig end
return false ;
// we passed
return true ;
}
/ * *
* validate a position or interval on the genome as valid
*
* @param contig the contig name
* @param start the start position
* @param stop the stop position
*
* @return true if it ' s valid , false otherwise
2010-04-01 20:47:48 +08:00
*
* performs interval - style validation : contig is valid and atart and stop less than the end
2010-02-17 04:35:35 +08:00
* /
2010-11-11 01:59:50 +08:00
public boolean validGenomeLoc ( String contig , int start , int stop ) {
return validGenomeLoc ( new GenomeLoc ( contig , getContigIndex ( contig , false ) , start , stop ) ) ;
2010-02-17 04:35:35 +08:00
}
/ * *
* validate a position or interval on the genome as valid
*
* @param contigIndex the contig name
* @param start the start position
* @param stop the stop position
*
* @return true if it ' s valid , false otherwise
2010-04-01 20:47:48 +08:00
*
* performs interval - style validation : contig is valid and atart and stop less than the end
2010-02-17 04:35:35 +08:00
* /
2010-11-11 01:59:50 +08:00
public boolean validGenomeLoc ( int contigIndex , int start , int stop ) {
2010-02-17 04:35:35 +08:00
if ( contigIndex < 0 | | contigIndex > = contigInfo . size ( ) ) return false ;
return validGenomeLoc ( new GenomeLoc ( getSequenceNameFromIndex ( contigIndex ) , contigIndex , start , stop ) ) ;
}
2009-06-22 22:39:41 +08:00
2009-07-01 03:17:24 +08:00
/ * *
* create a new genome loc from an existing loc , with a new start position
2010-03-18 03:39:30 +08:00
* Note that this function will NOT explicitly check the ending offset , in case someone wants to
* set the start of a new GenomeLoc pertaining to a read that goes off the end of the contig .
2009-07-01 03:17:24 +08:00
*
* @param loc the old location
* @param start a new start position
*
* @return the newly created genome loc
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc setStart ( GenomeLoc loc , int start ) {
2010-02-17 04:35:35 +08:00
return exceptionOnInvalidGenomeLoc ( new GenomeLoc ( loc . getContig ( ) , loc . getContigIndex ( ) , start , loc . getStop ( ) ) ) ;
2009-07-01 03:17:24 +08:00
}
/ * *
* create a new genome loc from an existing loc , with a new stop position
2010-03-18 03:39:30 +08:00
* Note that this function will NOT explicitly check the ending offset , in case someone wants to
* set the stop of a new GenomeLoc pertaining to a read that goes off the end of the contig .
2009-07-01 03:17:24 +08:00
*
* @param loc the old location
* @param stop a new stop position
*
* @return
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc setStop ( GenomeLoc loc , int stop ) {
2010-02-17 04:35:35 +08:00
return exceptionOnInvalidGenomeLoc ( new GenomeLoc ( loc . getContig ( ) , loc . getContigIndex ( ) , loc . start , stop ) ) ;
2009-07-01 03:17:24 +08:00
}
/ * *
* return a new genome loc , with an incremented position
2009-08-21 22:40:57 +08:00
*
2009-07-01 03:17:24 +08:00
* @param loc the old location
2009-08-21 22:40:57 +08:00
*
2009-07-01 03:17:24 +08:00
* @return a new genome loc
2009-06-22 22:39:41 +08:00
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc incPos ( GenomeLoc loc ) {
2009-07-01 03:17:24 +08:00
return incPos ( loc , 1 ) ;
}
/ * *
* return a new genome loc , with an incremented position
2009-08-21 22:40:57 +08:00
*
2009-07-01 03:17:24 +08:00
* @param loc the old location
2009-08-21 22:40:57 +08:00
* @param by how much to move the start and stop by
*
2009-07-01 03:17:24 +08:00
* @return a new genome loc
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc incPos ( GenomeLoc loc , int by ) {
2010-02-17 04:35:35 +08:00
return exceptionOnInvalidGenomeLoc ( new GenomeLoc ( loc . getContig ( ) , loc . getContigIndex ( ) , loc . start + by , loc . stop + by ) ) ;
2009-07-01 03:17:24 +08:00
}
/ * *
2010-11-11 01:59:50 +08:00
* Creates a GenomeLoc than spans the entire contig .
* @param contigName Name of the contig .
* @return A locus spanning the entire contig .
2009-07-01 03:17:24 +08:00
* /
2010-11-11 01:59:50 +08:00
public GenomeLoc createOverEntireContig ( String contigName ) {
SAMSequenceRecord contig = contigInfo . getSequence ( contigName ) ;
if ( contig = = null )
throw new ReviewedStingException ( "Unable to find contig named " + contigName ) ;
return exceptionOnInvalidGenomeLoc ( new GenomeLoc ( contigName , contig . getSequenceIndex ( ) , 1 , contig . getSequenceLength ( ) ) ) ;
}
2009-06-22 22:39:41 +08:00
}