2010-04-20 07:00:08 +08:00
/ *
2013-01-11 06:04:08 +08:00
* Copyright ( c ) 2012 The Broad Institute
*
* Permission is hereby granted , free of charge , to any person
* obtaining a copy of this software and associated documentation
* files ( the "Software" ) , to deal in the Software without
* restriction , including without limitation the rights to use ,
* copy , modify , merge , publish , distribute , sublicense , and / or sell
* copies of the Software , and to permit persons to whom the
* Software is furnished to do so , subject to the following
* conditions :
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software .
*
* THE SOFTWARE IS PROVIDED "AS IS" , WITHOUT WARRANTY OF ANY KIND ,
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY ,
* WHETHER IN AN ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING
* FROM , OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE .
* /
2010-04-20 07:00:08 +08:00
2009-06-22 22:39:41 +08:00
package org.broadinstitute.sting.utils ;
2011-07-18 08:29:58 +08:00
import com.google.java.contract.Ensures ;
import com.google.java.contract.Requires ;
import com.google.java.contract.ThrowEnsures ;
2010-06-10 03:25:02 +08:00
import net.sf.picard.reference.ReferenceSequenceFile ;
2009-06-22 22:39:41 +08:00
import net.sf.samtools.SAMRecord ;
import net.sf.samtools.SAMSequenceDictionary ;
import net.sf.samtools.SAMSequenceRecord ;
import org.apache.log4j.Logger ;
2011-08-04 04:04:51 +08:00
import org.broad.tribble.Feature ;
2010-09-12 23:07:38 +08:00
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
2010-09-12 22:02:43 +08:00
import org.broadinstitute.sting.utils.exceptions.UserException ;
2009-06-22 22:39:41 +08:00
/ * *
Contracts for Java are now written for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well-formed start, end, and contigs, with respect to the master dictionary. Where one previously created an invalid GenomeLoc and then asked whether it was valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utility functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilities from GenomeLocParser to IntervalUtils, where one would expect them to live
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to a different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
* Factory class for creating GenomeLocs
2009-06-22 22:39:41 +08:00
* /
2012-05-18 23:38:51 +08:00
public final class GenomeLocParser {
2009-06-22 22:39:41 +08:00
// Class-wide logger; declared final so the reference cannot be reassigned after class initialization.
private static final Logger logger = Logger.getLogger(GenomeLocParser.class);
2013-01-29 21:10:56 +08:00
/**
 * The amount of runtime validation a parser performs.
 */
public enum ValidationLevel {
    /** Perform the standard set of validation checks. */
    STANDARD,

    /** Skip essentially all checking. */
    NONE
}
2009-06-22 22:39:41 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// Ugly global variable defining the optional ordering of contig elements
//
// --------------------------------------------------------------------------------------------------------------
2012-08-14 03:59:35 +08:00
/ * *
* This single variable holds the underlying SamSequenceDictionary used by the GATK . We assume
* it is thread safe .
* /
final private SAMSequenceDictionary SINGLE_MASTER_SEQUENCE_DICTIONARY ;
/ * *
2013-01-29 21:10:56 +08:00
* A thread - local CachingSequenceDictionary
2012-08-14 03:59:35 +08:00
* /
2013-01-30 05:51:39 +08:00
private final ThreadLocal < MRUCachingSAMSequenceDictionary > contigInfoPerThread =
new ThreadLocal < MRUCachingSAMSequenceDictionary > ( ) {
2013-01-29 21:10:56 +08:00
@Override
2013-01-30 05:51:39 +08:00
protected MRUCachingSAMSequenceDictionary initialValue ( ) {
return new MRUCachingSAMSequenceDictionary ( SINGLE_MASTER_SEQUENCE_DICTIONARY ) ;
2013-01-29 21:10:56 +08:00
}
} ;
/ * *
* How much validation are we doing at runtime with this GenomeLocParser ?
* /
private final ValidationLevel validationLevel ;
2012-08-14 03:59:35 +08:00
/ * *
* @return a caching sequence dictionary appropriate for this thread
* /
2013-01-30 05:51:39 +08:00
private MRUCachingSAMSequenceDictionary getContigInfo ( ) {
2012-08-14 03:59:35 +08:00
return contigInfoPerThread . get ( ) ;
}
2011-04-21 09:31:26 +08:00
2009-06-22 22:39:41 +08:00
/**
 * Build a parser from the sequence dictionary carried by a reference file.
 *
 * @param refFile the reference file whose sequence dictionary is used; must not be null
 */
@Requires("refFile != null")
public GenomeLocParser(final ReferenceSequenceFile refFile) {
    this(refFile.getSequenceDictionary());
}
2013-01-29 21:10:56 +08:00
/**
 * Build a parser over seqDict that validates at {@link ValidationLevel#STANDARD}.
 *
 * @param seqDict a non-null sequence dictionary
 */
public GenomeLocParser(SAMSequenceDictionary seqDict) {
    this(seqDict, ValidationLevel.STANDARD);
}
/ * *
* Create a genome loc parser based on seqDict with the specified level of validation
* @param seqDict the sequence dictionary to use when creating genome locs
2013-01-30 05:51:39 +08:00
* @param validationLevel how much validation should we do of the genome locs at runtime ? Purely for testing purposes
2013-01-29 21:10:56 +08:00
* /
2013-01-30 05:51:39 +08:00
protected GenomeLocParser ( SAMSequenceDictionary seqDict , final ValidationLevel validationLevel ) {
if ( validationLevel = = null )
throw new IllegalArgumentException ( "validation level cannot be null" ) ;
2010-11-11 01:59:50 +08:00
if ( seqDict = = null ) { // we couldn't load the reference dictionary
//logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs");
throw new UserException . CommandLineException ( "Failed to load reference dictionary" ) ;
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
}
2013-01-30 05:51:39 +08:00
this . validationLevel = validationLevel ;
this . SINGLE_MASTER_SEQUENCE_DICTIONARY = seqDict ;
2013-01-29 21:10:56 +08:00
if ( logger . isDebugEnabled ( ) ) {
logger . debug ( String . format ( "Prepared reference sequence contig dictionary" ) ) ;
for ( SAMSequenceRecord contig : seqDict . getSequences ( ) ) {
logger . debug ( String . format ( " %s (%d bp)" , contig . getSequenceName ( ) , contig . getSequenceLength ( ) ) ) ;
}
2010-11-11 01:59:50 +08:00
}
2009-06-22 22:39:41 +08:00
}
Contracts for Java are now written for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well-formed start, end, and contigs, with respect to the master dictionary. Where one previously created an invalid GenomeLoc and then asked whether it was valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utility functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilities from GenomeLocParser to IntervalUtils, where one would expect them to live
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to a different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
/ * *
* Determines whether the given contig is valid with respect to the sequence dictionary
* already installed in the GenomeLoc .
*
2013-01-30 05:51:39 +08:00
* @param contig a potentially null string name for the contig
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
* @return True if the contig is valid . False otherwise .
* /
2013-01-30 05:51:39 +08:00
public final boolean contigIsInDictionary ( final String contig ) {
2012-08-14 03:59:35 +08:00
return contig ! = null & & getContigInfo ( ) . hasContig ( contig ) ;
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
}
2011-05-21 10:01:59 +08:00
/ * *
* get the contig ' s SAMSequenceRecord
*
* @param contig the string name of the contig
*
* @return the sam sequence record
* /
@Ensures ( "result != null" )
@ThrowEnsures ( { "UserException.MalformedGenomeLoc" , "!contigIsInDictionary(contig) || contig == null" } )
2012-05-18 23:38:51 +08:00
public final SAMSequenceRecord getContigInfo ( final String contig ) {
2011-05-21 10:01:59 +08:00
if ( contig = = null | | ! contigIsInDictionary ( contig ) )
throw new UserException . MalformedGenomeLoc ( String . format ( "Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary" , contig ) ) ;
2012-08-14 03:59:35 +08:00
return getContigInfo ( ) . getSequence ( contig ) ;
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
}
2009-06-22 22:39:41 +08:00
/ * *
* Returns the contig index of a specified string version of the contig
*
* @param contig the contig string
*
* @return the contig index , - 1 if not found
* /
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
@Ensures ( "result >= 0" )
2011-05-21 10:01:59 +08:00
@ThrowEnsures ( { "UserException.MalformedGenomeLoc" , "!contigIsInDictionary(contig) || contig == null" } )
2012-05-18 23:38:51 +08:00
public final int getContigIndex ( final String contig ) {
2011-05-21 10:01:59 +08:00
return getContigInfo ( contig ) . getSequenceIndex ( ) ;
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
}
@Requires ( "contig != null" )
protected int getContigIndexWithoutException ( final String contig ) {
2012-08-14 03:59:35 +08:00
if ( contig = = null | | ! getContigInfo ( ) . hasContig ( contig ) )
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
return - 1 ;
2012-08-14 03:59:35 +08:00
return getContigInfo ( ) . getSequenceIndex ( contig ) ;
2009-06-22 22:39:41 +08:00
}
2012-05-18 23:38:51 +08:00
/ * *
* Return the master sequence dictionary used within this GenomeLocParser
* @return
* /
public final SAMSequenceDictionary getContigs ( ) {
2013-01-30 05:51:39 +08:00
return getContigInfo ( ) . getDictionary ( ) ;
2012-05-18 23:38:51 +08:00
}
2011-05-21 10:01:59 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// Low-level creation functions
//
// --------------------------------------------------------------------------------------------------------------
2013-01-30 05:51:39 +08:00
2010-06-11 04:54:36 +08:00
/ * *
2013-01-30 05:51:39 +08:00
* @see # createGenomeLoc ( String , int , int , int , boolean ) for exact details of the creation .
2010-04-01 20:47:48 +08:00
*
2013-01-30 05:51:39 +08:00
* Note that because this function doesn ' t take the contig index as an argument for contig , it
* has a slight performance penalty over the version that does take the contig index . Does not
* require the created genome loc on the reference genome
2011-05-21 10:01:59 +08:00
* /
@Ensures ( "result != null" )
@ThrowEnsures ( { "UserException.MalformedGenomeLoc" , "!isValidGenomeLoc(contig, start, stop)" } )
public GenomeLoc createGenomeLoc ( String contig , final int start , final int stop ) {
return createGenomeLoc ( contig , getContigIndex ( contig ) , start , stop ) ;
}
2013-01-30 05:51:39 +08:00
/ * *
* @see # createGenomeLoc ( String , int , int , int , boolean ) for exact details of the creation .
*
* Note that because this function doesn ' t take the contig index as an argument for contig , it
* has a slight performance penalty over the version that does take the contig index .
* /
public GenomeLoc createGenomeLoc ( final String contig , final int start , final int stop , boolean mustBeOnReference ) {
2011-05-21 10:01:59 +08:00
return createGenomeLoc ( contig , getContigIndex ( contig ) , start , stop , mustBeOnReference ) ;
}
2013-01-30 05:51:39 +08:00
/ * *
* @see # createGenomeLoc ( String , int , int , int , boolean ) for exact details of the creation .
*
* Doesn ' t require the start and stop to be on the genome
* /
2011-05-21 10:01:59 +08:00
@ThrowEnsures ( { "UserException.MalformedGenomeLoc" , "!isValidGenomeLoc(contig, start, stop, false)" } )
public GenomeLoc createGenomeLoc ( String contig , int index , final int start , final int stop ) {
return createGenomeLoc ( contig , index , start , stop , false ) ;
}
2013-01-30 05:51:39 +08:00
/ * *
* Create a GenomeLoc on contig , starting at start and ending ( inclusive ) at stop .
*
* @param contig the contig name
* @param index the index into the GATK ' s SAMSequencingDictionary of contig ( passed for efficiency to avoid the lookup )
* @param start the starting position
* @param stop the stop position of this loc , inclusive
* @param mustBeOnReference if true , this factory will throw a UserException . MalformedGenomeLoc if start or stop isn ' t on the contig
*
* @return a non - null GenomeLoc
* /
2011-05-21 10:01:59 +08:00
@ThrowEnsures ( { "UserException.MalformedGenomeLoc" , "!isValidGenomeLoc(contig, start, stop,mustBeOnReference)" } )
2013-01-30 05:51:39 +08:00
@Ensures ( "result != null" )
public GenomeLoc createGenomeLoc ( final String contig , int index , final int start , final int stop , boolean mustBeOnReference ) {
2013-01-29 21:10:56 +08:00
// optimization: by interning the string we ensure that future comparisons use == not the full string comp
final String interned = validateGenomeLoc ( contig , index , start , stop , mustBeOnReference ) ;
return new GenomeLoc ( interned , index , start , stop ) ;
2011-05-21 10:01:59 +08:00
}
2013-01-29 21:10:56 +08:00
/ * *
2013-01-30 05:51:39 +08:00
* Create a new GenomeLoc , on contig , including the single position pos .
*
* Pos is not required to be on the reference
*
* @see # createGenomeLoc ( String , int , int , int , boolean ) for exact details of the creation .
*
* @param contig the contig name
* @param pos the start and stop of the created genome loc
*
* @return a genome loc representing a single base at the specified postion on the contig
2013-01-29 21:10:56 +08:00
* /
2013-01-30 05:51:39 +08:00
@Ensures ( "result != null" )
@ThrowEnsures ( { "UserException.MalformedGenomeLoc" , "!isValidGenomeLoc(contig, pos, pos, true)" } )
public GenomeLoc createGenomeLoc ( final String contig , final int pos ) {
return createGenomeLoc ( contig , getContigIndex ( contig ) , pos , pos ) ;
2013-01-03 03:10:55 +08:00
}
2011-05-21 10:01:59 +08:00
/ * *
* validate a position or interval on the genome as valid
2010-04-01 20:47:48 +08:00
*
2011-05-21 10:01:59 +08:00
* Requires that contig exist in the master sequence dictionary , and that contig index be valid as well . Requires
* that start < = stop .
*
* if mustBeOnReference is true ,
* performs boundary validation for genome loc INTERVALS :
* start and stop are on contig and start < = stop
2010-04-01 20:47:48 +08:00
*
2011-05-21 10:01:59 +08:00
* @param contig the contig name
* @param start the start position
* @param stop the stop position
*
2013-01-29 21:10:56 +08:00
* @return the interned contig name , an optimization that ensures that contig = = the string in the sequence dictionary
2009-09-22 09:32:35 +08:00
* /
2013-01-29 21:10:56 +08:00
protected String validateGenomeLoc ( final String contig , final int contigIndex , final int start , final int stop , final boolean mustBeOnReference ) {
if ( validationLevel = = ValidationLevel . NONE )
return contig ;
else {
if ( stop < start )
vglHelper ( String . format ( "The stop position %d is less than start %d in contig %s" , stop , start , contig ) ) ;
final SAMSequenceRecord contigInfo = getContigInfo ( ) . getSequence ( contig ) ;
if ( contigInfo . getSequenceIndex ( ) ! = contigIndex )
vglHelper ( String . format ( "The contig index %d is bad, doesn't equal the contig index %d of the contig from a string %s" ,
contigIndex , contigInfo . getSequenceIndex ( ) , contig ) ) ;
if ( mustBeOnReference ) {
if ( start < 1 )
vglHelper ( String . format ( "The start position %d is less than 1" , start ) ) ;
if ( stop < 1 )
vglHelper ( String . format ( "The stop position %d is less than 1" , stop ) ) ;
final int contigSize = contigInfo . getSequenceLength ( ) ;
if ( start > contigSize | | stop > contigSize )
vglHelper ( String . format ( "The genome loc coordinates %d-%d exceed the contig size (%d)" , start , stop , contigSize ) ) ;
}
2011-05-21 10:01:59 +08:00
2013-01-29 21:10:56 +08:00
return contigInfo . getSequenceName ( ) ;
2011-05-21 10:01:59 +08:00
}
}
2013-01-29 21:10:56 +08:00
/ * *
* Would a genome loc created with the given parameters be valid w . r . t . the master sequence dictionary ?
* @param contig the contig we ' d use
* @param start the start position
* @param stop the stop
* @param mustBeOnReference should we require the resulting genome loc to be completely on the reference genome ?
* @return true if this would produce a valid genome loc , false otherwise
* /
2011-05-21 10:01:59 +08:00
public boolean isValidGenomeLoc ( String contig , int start , int stop , boolean mustBeOnReference ) {
2013-01-29 21:10:56 +08:00
try {
validateGenomeLoc ( contig , getContigIndexWithoutException ( contig ) , start , stop , mustBeOnReference ) ;
return true ;
} catch ( ReviewedStingException e ) {
return false ;
}
2011-05-21 10:01:59 +08:00
}
2013-01-29 21:10:56 +08:00
/ * *
* @see # isValidGenomeLoc ( String , int , int ) with mustBeOnReference = = true
* /
2011-05-21 10:01:59 +08:00
public boolean isValidGenomeLoc ( String contig , int start , int stop ) {
2013-01-29 21:10:56 +08:00
return isValidGenomeLoc ( contig , start , stop , true ) ;
2011-05-21 10:01:59 +08:00
}
2013-01-29 21:10:56 +08:00
private void vglHelper ( final String msg ) {
throw new UserException . MalformedGenomeLoc ( "Parameters to GenomeLocParser are incorrect:" + msg ) ;
2011-05-21 10:01:59 +08:00
}
// --------------------------------------------------------------------------------------------------------------
//
// Parsing genome locs
//
// --------------------------------------------------------------------------------------------------------------
2009-09-22 09:32:35 +08:00
2009-06-22 22:39:41 +08:00
/ * *
2011-05-21 10:01:59 +08:00
* parse a genome interval , from a location string
2010-06-11 04:54:36 +08:00
*
2011-05-21 10:01:59 +08:00
* Performs interval - style validation :
2009-06-22 22:39:41 +08:00
*
2011-05-21 10:01:59 +08:00
* contig is valid ; start and stop less than the end ; start < = stop , and start / stop are on the contig
2009-06-22 22:39:41 +08:00
* @param str the string to parse
*
* @return a GenomeLoc representing the String
2010-04-01 20:47:48 +08:00
*
2009-06-22 22:39:41 +08:00
* /
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
@Requires ( "str != null" )
@Ensures ( "result != null" )
2010-11-11 01:59:50 +08:00
public GenomeLoc parseGenomeLoc ( final String str ) {
2009-06-22 22:39:41 +08:00
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
//System.out.printf("Parsing location '%s'%n", str);
2010-06-11 04:54:36 +08:00
2009-06-22 22:39:41 +08:00
String contig = null ;
2010-11-11 01:59:50 +08:00
int start = 1 ;
int stop = - 1 ;
2010-06-11 04:54:36 +08:00
2012-11-28 00:00:33 +08:00
final int colonIndex = str . lastIndexOf ( ":" ) ;
2010-06-11 04:54:36 +08:00
if ( colonIndex = = - 1 ) {
contig = str . substring ( 0 , str . length ( ) ) ; // chr1
stop = Integer . MAX_VALUE ;
} else {
contig = str . substring ( 0 , colonIndex ) ;
final int dashIndex = str . indexOf ( '-' , colonIndex ) ;
try {
if ( dashIndex = = - 1 ) {
if ( str . charAt ( str . length ( ) - 1 ) = = '+' ) {
start = parsePosition ( str . substring ( colonIndex + 1 , str . length ( ) - 1 ) ) ; // chr:1+
stop = Integer . MAX_VALUE ;
} else {
start = parsePosition ( str . substring ( colonIndex + 1 ) ) ; // chr1:1
stop = start ;
}
} else {
start = parsePosition ( str . substring ( colonIndex + 1 , dashIndex ) ) ; // chr1:1-1
stop = parsePosition ( str . substring ( dashIndex + 1 ) ) ;
2010-03-11 00:25:16 +08:00
}
2010-06-11 04:54:36 +08:00
} catch ( Exception e ) {
2010-09-12 22:02:43 +08:00
throw new UserException ( "Failed to parse Genome Location string: " + str , e ) ;
2010-03-11 00:25:16 +08:00
}
2009-06-22 22:39:41 +08:00
}
2010-04-01 20:47:48 +08:00
// is the contig valid?
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
if ( ! contigIsInDictionary ( contig ) )
2011-05-21 10:01:59 +08:00
throw new UserException . MalformedGenomeLoc ( "Contig '" + contig + "' does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?" ) ;
2009-09-22 06:37:47 +08:00
2010-11-11 01:59:50 +08:00
if ( stop = = Integer . MAX_VALUE )
2009-06-22 22:39:41 +08:00
// lookup the actually stop position!
stop = getContigInfo ( contig ) . getSequenceLength ( ) ;
2011-05-21 10:01:59 +08:00
return createGenomeLoc ( contig , getContigIndex ( contig ) , start , stop , true ) ;
2009-06-22 22:39:41 +08:00
}
2010-06-11 04:54:36 +08:00
/ * *
* Parses a number like 1 , 000 , 000 into a long .
* @param pos
* /
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
@Requires ( "pos != null" )
@Ensures ( "result >= 0" )
2013-01-30 05:51:39 +08:00
protected int parsePosition ( final String pos ) {
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
if ( pos . indexOf ( '-' ) ! = - 1 ) {
throw new NumberFormatException ( "Position: '" + pos + "' can't contain '-'." ) ;
}
2010-06-11 04:54:36 +08:00
if ( pos . indexOf ( ',' ) ! = - 1 ) {
final StringBuilder buffer = new StringBuilder ( ) ;
for ( int i = 0 ; i < pos . length ( ) ; i + + ) {
final char c = pos . charAt ( i ) ;
if ( c = = ',' ) {
continue ;
} else if ( c < '0' | | c > '9' ) {
throw new NumberFormatException ( "Position: '" + pos + "' contains invalid chars." ) ;
2010-11-11 01:59:50 +08:00
} else {
2010-06-11 04:54:36 +08:00
buffer . append ( c ) ;
}
}
2010-11-11 01:59:50 +08:00
return Integer . parseInt ( buffer . toString ( ) ) ;
2010-06-11 04:54:36 +08:00
} else {
2010-11-11 01:59:50 +08:00
return Integer . parseInt ( pos ) ;
2010-06-11 04:54:36 +08:00
}
2009-06-22 22:39:41 +08:00
}
2011-05-21 10:01:59 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// Parsing string representations
//
// --------------------------------------------------------------------------------------------------------------
2009-06-22 22:39:41 +08:00
/ * *
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
* create a genome loc , given a read . If the read is unmapped , * and * yet the read has a contig and start position ,
* then a GenomeLoc is returned for contig : start - start , otherwise and UNMAPPED GenomeLoc is returned .
2009-06-22 22:39:41 +08:00
*
* @param read
*
* @return
* /
2011-05-21 10:01:59 +08:00
@Requires ( "read != null" )
@Ensures ( "result != null" )
2010-11-11 01:59:50 +08:00
public GenomeLoc createGenomeLoc ( final SAMRecord read ) {
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
if ( read . getReadUnmappedFlag ( ) & & read . getReferenceIndex ( ) = = - 1 )
// read is unmapped and not placed anywhere on the genome
2011-05-21 10:01:59 +08:00
return GenomeLoc . UNMAPPED ;
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
else {
2011-06-04 02:06:41 +08:00
// Use Math.max to ensure that end >= start (Picard assigns the end to reads that are entirely within an insertion as start-1)
2011-06-03 04:40:56 +08:00
int end = read . getReadUnmappedFlag ( ) ? read . getAlignmentStart ( ) : Math . max ( read . getAlignmentEnd ( ) , read . getAlignmentStart ( ) ) ;
2011-05-21 10:01:59 +08:00
return createGenomeLoc ( read . getReferenceName ( ) , read . getReferenceIndex ( ) , read . getAlignmentStart ( ) , end , false ) ;
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
}
2009-06-22 22:39:41 +08:00
}
2011-08-04 04:04:51 +08:00
/ * *
* Creates a GenomeLoc from a Tribble feature
* @param feature
* @return
* /
public GenomeLoc createGenomeLoc ( final Feature feature ) {
return createGenomeLoc ( feature . getChr ( ) , feature . getStart ( ) , feature . getEnd ( ) ) ;
}
2011-11-10 23:58:40 +08:00
/ * *
2013-01-30 05:51:39 +08:00
* @see GenomeLoc . setStart
2009-07-01 03:17:24 +08:00
* /
2013-01-30 05:51:39 +08:00
@Deprecated
public GenomeLoc setStart ( final GenomeLoc loc , final int start ) {
2011-05-21 10:01:59 +08:00
return createGenomeLoc ( loc . getContig ( ) , loc . getContigIndex ( ) , start , loc . getStop ( ) ) ;
2009-07-01 03:17:24 +08:00
}
/ * *
2013-01-30 05:51:39 +08:00
* @see GenomeLoc . setStop
2009-07-01 03:17:24 +08:00
* /
2013-01-30 05:51:39 +08:00
@Deprecated
public GenomeLoc setStop ( final GenomeLoc loc , final int stop ) {
2011-05-21 10:01:59 +08:00
return createGenomeLoc ( loc . getContig ( ) , loc . getContigIndex ( ) , loc . start , stop ) ;
2009-07-01 03:17:24 +08:00
}
/ * *
2013-01-30 05:51:39 +08:00
* @see GenomeLoc . incPos
2009-06-22 22:39:41 +08:00
* /
2013-01-30 05:51:39 +08:00
@Deprecated
public GenomeLoc incPos ( final GenomeLoc loc ) {
2009-07-01 03:17:24 +08:00
return incPos ( loc , 1 ) ;
}
/ * *
2013-01-30 05:51:39 +08:00
* @see GenomeLoc . incPos
2009-07-01 03:17:24 +08:00
* /
2013-01-30 05:51:39 +08:00
@Deprecated
public GenomeLoc incPos ( final GenomeLoc loc , final int by ) {
2011-05-21 10:01:59 +08:00
return createGenomeLoc ( loc . getContig ( ) , loc . getContigIndex ( ) , loc . start + by , loc . stop + by ) ;
2009-07-01 03:17:24 +08:00
}
/ * *
2010-11-11 01:59:50 +08:00
* Creates a GenomeLoc than spans the entire contig .
* @param contigName Name of the contig .
* @return A locus spanning the entire contig .
2009-07-01 03:17:24 +08:00
* /
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
@Requires ( "contigName != null" )
@Ensures ( "result != null" )
2013-01-30 05:51:39 +08:00
public GenomeLoc createOverEntireContig ( final String contigName ) {
2012-08-14 03:59:35 +08:00
SAMSequenceRecord contig = getContigInfo ( ) . getSequence ( contigName ) ;
2011-05-21 10:01:59 +08:00
return createGenomeLoc ( contigName , contig . getSequenceIndex ( ) , 1 , contig . getSequenceLength ( ) , true ) ;
}
2011-11-18 02:53:46 +08:00
/ * *
* Creates a loc to the left ( starting at the loc start + 1 ) of maxBasePairs size .
* @param loc The original loc
* @param maxBasePairs The maximum number of basePairs
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig .
* /
@Requires ( { "loc != null" , "maxBasePairs > 0" } )
2013-01-30 05:51:39 +08:00
public GenomeLoc createGenomeLocAtStart ( final GenomeLoc loc , final int maxBasePairs ) {
2011-11-18 02:53:46 +08:00
if ( GenomeLoc . isUnmapped ( loc ) )
return null ;
2013-01-30 05:51:39 +08:00
final String contigName = loc . getContig ( ) ;
final SAMSequenceRecord contig = getContigInfo ( ) . getSequence ( contigName ) ;
final int contigIndex = contig . getSequenceIndex ( ) ;
2011-11-18 02:53:46 +08:00
int start = loc . getStart ( ) - maxBasePairs ;
int stop = loc . getStart ( ) - 1 ;
if ( start < 1 )
start = 1 ;
if ( stop < 1 )
return null ;
return createGenomeLoc ( contigName , contigIndex , start , stop , true ) ;
}
2012-06-19 09:36:27 +08:00
/ * *
* Creates a loc padded in both directions by maxBasePairs size ( if possible ) .
* @param loc The original loc
* @param padding The number of base pairs to pad on either end
* @return The contiguous loc of length up to the original length + 2 * padding ( depending on the start / end of the contig ) .
* /
2013-01-30 05:51:39 +08:00
@Requires ( { "loc != null" , "padding >= 0" } )
2012-06-19 09:36:27 +08:00
public GenomeLoc createPaddedGenomeLoc ( final GenomeLoc loc , final int padding ) {
2013-01-30 05:51:39 +08:00
if ( GenomeLoc . isUnmapped ( loc ) | | padding = = 0 )
2012-06-19 09:36:27 +08:00
return loc ;
2013-01-30 05:51:39 +08:00
else
return createGenomeLocOnContig ( loc . getContig ( ) , loc . getContigIndex ( ) , loc . getStart ( ) - padding , loc . getStop ( ) + padding ) ;
2012-06-19 09:36:27 +08:00
}
2011-11-18 02:53:46 +08:00
/ * *
* Creates a loc to the right ( starting at the loc stop + 1 ) of maxBasePairs size .
* @param loc The original loc
* @param maxBasePairs The maximum number of basePairs
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig .
* /
@Requires ( { "loc != null" , "maxBasePairs > 0" } )
2013-01-30 05:51:39 +08:00
public GenomeLoc createGenomeLocAtStop ( final GenomeLoc loc , final int maxBasePairs ) {
2011-11-18 02:53:46 +08:00
if ( GenomeLoc . isUnmapped ( loc ) )
return null ;
String contigName = loc . getContig ( ) ;
2012-08-14 03:59:35 +08:00
SAMSequenceRecord contig = getContigInfo ( ) . getSequence ( contigName ) ;
2011-11-18 02:53:46 +08:00
int contigIndex = contig . getSequenceIndex ( ) ;
int contigLength = contig . getSequenceLength ( ) ;
int start = loc . getStop ( ) + 1 ;
int stop = loc . getStop ( ) + maxBasePairs ;
if ( start > contigLength )
return null ;
if ( stop > contigLength )
stop = contigLength ;
return createGenomeLoc ( contigName , contigIndex , start , stop , true ) ;
}
2013-01-30 05:51:39 +08:00
/ * *
* @see # createGenomeLocOnContig ( String , int , int , int ) with the contig index looked up from contig
* /
public GenomeLoc createGenomeLocOnContig ( final String contig , final int start , final int stop ) {
return createGenomeLocOnContig ( contig , getContigIndex ( contig ) , start , stop ) ;
}
/ * *
* Create a new genome loc , bounding start and stop by the start and end of contig
*
* This function will return null if start and stop cannot be adjusted in any reasonable way
* to be on the contig . For example , if start and stop are both past the end of the contig ,
* there ' s no way to fix this , and null will be returned .
*
* @param contig our contig
* @param start our start as an arbitrary integer ( may be negative , etc )
* @param stop our stop as an arbitrary integer ( may be negative , etc )
* @return a valid genome loc over contig , or null if a meaningful genome loc cannot be created
* /
public GenomeLoc createGenomeLocOnContig ( final String contig , final int contigIndex , final int start , final int stop ) {
final int contigLength = getContigInfo ( ) . getSequence ( contigIndex ) . getSequenceLength ( ) ;
final int boundedStart = Math . max ( 1 , start ) ;
final int boundedStop = Math . min ( contigLength , stop ) ;
if ( boundedStart > contigLength | | boundedStop < 1 )
// there's no meaningful way to create this genome loc, as the start and stop are off the contig
return null ;
else
return createGenomeLoc ( contig , contigIndex , boundedStart , boundedStop ) ;
}
2009-06-22 22:39:41 +08:00
}