From 2836c161eeb62066c5c37c0caf8795212382d82d Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sun, 11 Mar 2012 14:45:59 -0400 Subject: [PATCH] Moving trimToVariableRegion out of reduced reads and into a public static ReadClipper function. HaplotypeCaller clips reads to the active region boundries before passing to the HMM. The philosophy of the HC is moving towards genotyping the entire haplotype sequence contained within the active region as a single allele. --- .../utils/activeregion/ActiveRegion.java | 23 ++++++++++-- .../sting/utils/clipping/ReadClipper.java | 36 +++++++++++++++++++ 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 6279e0061..c2e69ee2d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -4,6 +4,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.HasGenomeLocation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; @@ -38,14 +39,30 @@ public class ActiveRegion implements HasGenomeLocation { fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); reads.add( read ); } + + public void hardClipToActiveRegion() { + final ArrayList clippedReads = ReadClipper.hardClipToRegion( reads, activeRegionLoc.getStart(), activeRegionLoc.getStop() ); + reads.clear(); + reads.addAll(clippedReads); + } public ArrayList getReads() { return reads; } - public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) { - return getReference( referenceReader, 0 ); + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { + return getActiveRegionReference(referenceReader, 0); } - public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return referenceReader.getSubsequenceAt( activeRegionLoc.getContig(), + Math.max(1, activeRegionLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(activeRegionLoc.getContig()).getSequenceLength(), activeRegionLoc.getStop() + padding) ).getBases(); + } + + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { + return getFullReference(referenceReader, 0); + } + + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(), Math.max(1, fullExtentReferenceLoc.getStart() - padding), Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 7a664bd61..1eab43256 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -312,6 +312,42 @@ public class ReadClipper { } + /** + * Hard clip the read to the variable region (from refStart to refStop) + * + * @param read the read to be clipped + * @param refStart the beginning of the variant region (inclusive) + * @param refStop the end of the variant region (inclusive) + * @return the read hard clipped to the variant region + */ + public static GATKSAMRecord hardClipToRegion( final GATKSAMRecord read, final int refStart, final int refStop ) { + final int start = read.getAlignmentStart(); + final int stop = read.getAlignmentEnd(); + + // check if the read is contained in region + if (start <= refStop && stop >= refStart) { + if (start < refStart && stop > refStop) + return hardClipBothEndsByReferenceCoordinates(read, refStart - 1, refStop + 1); + else if (start < refStart) + return hardClipByReferenceCoordinatesLeftTail(read, refStart - 1); + else if (stop > refStop) + return hardClipByReferenceCoordinatesRightTail(read, refStop + 1); + return read; + } else + return GATKSAMRecord.emptyRead(read); + + } + public static ArrayList hardClipToRegion( final ArrayList reads, final int refStart, final int refStop ) { + final ArrayList returnList = new ArrayList( reads.size() ); + for( final GATKSAMRecord read : reads ) { + final GATKSAMRecord clippedRead = hardClipToRegion( read, refStart, refStop ); + if( !clippedRead.isEmpty() ) { + returnList.add( clippedRead ); + } + } + return returnList; + } + /** * Checks if a read contains adaptor sequences. If it does, hard clips them out. *