From 4d6312d4ea1e22f53649c4c38fd0458753effe0f Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sun, 22 Jan 2012 14:31:01 -0500 Subject: [PATCH] HaplotypeCaller is now an ActiveRegionWalker. --- .../traversals/TraverseActiveRegions.java | 24 +++---- .../gatk/walkers/ActiveRegionWalker.java | 4 ++ .../sting/utils/activeregion/ActiveRead.java | 19 ------ .../utils/activeregion/ActiveRegion.java | 24 +++---- .../sting/utils/fragments/FragmentUtils.java | 65 +++++++++++++++++++ 5 files changed, 95 insertions(+), 41 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRead.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index b18d1ceb9..ebfcc0c29 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -46,7 +46,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine integrateActiveList( final ArrayList activeList ) { final ArrayList returnList = new ArrayList(); if( activeList.size() == 0 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 8405f762d..d7e170d73 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -37,6 +37,10 @@ public abstract class ActiveRegionWalker extends Walker reads = new ArrayList(); - private byte[] reference = null; + private final ArrayList reads = new ArrayList(); private final GenomeLoc activeRegionLoc; private final GenomeLoc extendedLoc; private final int extension; @@ -35,28 +34,31 @@ public class ActiveRegion implements HasGenomeLocation { } // add each read to the bin and extend the reference genome activeRegionLoc if needed - public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) { + public void add( final GATKSAMRecord read ) { fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); - reads.add( new ActiveRead(read, isPrimaryRegion) ); + reads.add( read ); } - public ArrayList getReads() { return reads; } + public ArrayList getReads() { return reads; } public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) { - // set up the reference if we haven't done so yet - if ( reference == null ) { - reference = referenceReader.getSubsequenceAt(fullExtentReferenceLoc.getContig(), fullExtentReferenceLoc.getStart(), fullExtentReferenceLoc.getStop()).getBases(); - } + return getReference( referenceReader, 0 ); + } - return reference; + public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(), + Math.max(1, fullExtentReferenceLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); } @Override public GenomeLoc getLocation() { return activeRegionLoc; } - public GenomeLoc getExtendedLoc() { return extendedLoc; } public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; } public int getExtension() { return extension; } public int size() { return reads.size(); } + public void clearReads() { reads.clear(); } + public void remove( final GATKSAMRecord read ) { reads.remove( read ); } + public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index e5500ca21..68bf6dce8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -1,9 +1,15 @@ package org.broadinstitute.sting.utils.fragments; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.*; @@ -121,4 +127,63 @@ public class FragmentUtils { return create(reads, reads.size(), SamRecordGetter); } + public final static List mergeOverlappingPairedFragments( List overlappingPair ) { + final byte MIN_QUAL_BAD_OVERLAP = 16; + if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } + + GATKSAMRecord firstRead = overlappingPair.get(0); + GATKSAMRecord secondRead = overlappingPair.get(1); + if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + firstRead = overlappingPair.get(1); + secondRead = overlappingPair.get(0); + } + if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A + } + if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { + return overlappingPair; // fragments contain indels so don't merge them + } + + final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); + + final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() ); + final int numBases = firstReadStop + secondRead.getReadLength(); + final byte[] bases = new byte[numBases]; + final byte[] quals = new byte[numBases]; + final byte[] firstReadBases = firstRead.getReadBases(); + final byte[] firstReadQuals = firstRead.getBaseQualities(); + final byte[] secondReadBases = secondRead.getReadBases(); + final byte[] secondReadQuals = secondRead.getBaseQualities(); + for(int iii = 0; iii < firstReadStop; iii++) { + bases[iii] = firstReadBases[iii]; + quals[iii] = firstReadQuals[iii]; + } + for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { + if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { + return overlappingPair;// high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + } + bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] ); + quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] ); + } + for(int iii = firstRead.getReadLength(); iii < numBases; iii++) { + bases[iii] = secondReadBases[iii-firstReadStop]; + quals[iii] = secondReadQuals[iii-firstReadStop]; + } + + final GATKSAMRecord returnRead = new GATKSAMRecord(firstRead.getHeader()); + returnRead.setAlignmentStart(firstRead.getUnclippedStart()); + returnRead.setReadBases( bases ); + returnRead.setBaseQualities( quals ); + returnRead.setReadGroup( firstRead.getReadGroup() ); + returnRead.setReferenceName( firstRead.getReferenceName() ); + final CigarElement c = new CigarElement(bases.length, CigarOperator.M); + final ArrayList cList = new ArrayList(); + cList.add(c); + returnRead.setCigar( new Cigar( cList )); + returnRead.setMappingQuality( firstRead.getMappingQuality() ); + + final ArrayList returnList = new ArrayList(); + returnList.add(returnRead); + return returnList; + } }