From 522f8b58be66568a7868adce3dc42da3b218b39c Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 8 May 2009 22:18:04 +0000 Subject: [PATCH] Added second method for getting large sequences of the reference for use in reads traversals. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@645 348d0f76-0448-11de-a6fe-93d51630548a --- .../providers/ReferenceProvider.java | 46 ++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java index f2e088a49..f61595b01 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; import edu.mit.broad.picard.reference.ReferenceSequence; import net.sf.samtools.util.StringUtil; +import net.sf.samtools.SAMRecord; /** * Created by IntelliJ IDEA. @@ -14,10 +15,49 @@ import net.sf.samtools.util.StringUtil; * To change this template use File | Settings | File Templates. */ public class ReferenceProvider { + private IndexedFastaSequenceFile sequenceFile; + private Shard shard; + + // Lazy private ReferenceSequence referenceSequence; private GenomeLoc referenceInterval; public ReferenceProvider( IndexedFastaSequenceFile sequenceFile, Shard shard ) { + this.sequenceFile = sequenceFile; + this.shard = shard; + } + + public char getReferenceBase( GenomeLoc genomeLoc ) throws InvalidPositionException { + if( referenceSequence == null ) + lazyInitializeLocusAccess(); + + validateLocation( genomeLoc ); + int offset = (int)(genomeLoc.getStart() - referenceInterval.getStart()); + return StringUtil.bytesToString( referenceSequence.getBases(), offset, 1 ).charAt(0); + } + + /** + * Gets the bases of the reference that are aligned to the given read. + * @param read the read for which to extract reference information. + * @return The bases corresponding to this read, or null if the read is unmapped. + */ + public char[] getReferenceBases( SAMRecord read ) { + if( read.getReadUnmappedFlag() ) + return null; + + String contig = read.getReferenceName(); + int start = read.getAlignmentStart(); + int stop = read.getAlignmentEnd(); + + ReferenceSequence alignmentToReference = sequenceFile.getSubsequenceAt( contig, start, stop ); + return StringUtil.bytesToString(alignmentToReference.getBases()).toCharArray(); + } + + /** + * Perform a lazy initialization of access to the locus. Sets up the reference sequence and + * limits the user to work only at that sequence. + */ + private void lazyInitializeLocusAccess() { GenomeLoc position = shard.getGenomeLoc(); this.referenceSequence = sequenceFile.getSubsequenceAt( position.getContig(), position.getStart(), @@ -25,12 +65,6 @@ public class ReferenceProvider { this.referenceInterval = position; } - public char getReferenceBase( GenomeLoc genomeLoc ) throws InvalidPositionException { - validateLocation( genomeLoc ); - int offset = (int)(genomeLoc.getStart() - referenceInterval.getStart()); - return StringUtil.bytesToString( referenceSequence.getBases(), offset, 1 ).charAt(0); - } - /** * Validates that the genomeLoc is one base wide and is in the reference sequence. * @param genomeLoc location to verify.