HaplotypeCaller is now an ActiveRegionWalker.

This commit is contained in:
Ryan Poplin 2012-01-22 14:31:01 -05:00
parent 3b1aad4f17
commit 4d6312d4ea
5 changed files with 95 additions and 41 deletions

View File

@ -46,7 +46,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
final LocusView locusView = getLocusView( walker, dataProvider );
final GenomeLocSortedSet initialIntervals = engine.getIntervals();
final GenomeLocSortedSet initialIntervals = engine.getIntervals(); // BUGBUG: unfortunate inefficiency that needs to be removed
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
@ -166,20 +166,22 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
bestRegion = otherRegionToTest;
}
}
bestRegion.add( (GATKSAMRecord) read, true );
bestRegion.add( (GATKSAMRecord) read );
// The read is also added to all other regions in which it overlaps but marked as non-primary
if( !bestRegion.equals(activeRegion) ) {
activeRegion.add( (GATKSAMRecord) read, false );
}
for( final ActiveRegion otherRegionToTest : workQueue ) {
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
activeRegion.add( (GATKSAMRecord) read, false );
if( walker.wantsNonPrimaryReads() ) {
if( !bestRegion.equals(activeRegion) ) {
activeRegion.add( (GATKSAMRecord) read );
}
for( final ActiveRegion otherRegionToTest : workQueue ) {
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
activeRegion.add( (GATKSAMRecord) read );
}
}
}
placedReads.add( read );
} else if( activeRegion.getExtendedLoc().overlapsP( readLoc ) ) {
activeRegion.add( (GATKSAMRecord) read, false );
} else if( activeRegion.getExtendedLoc().overlapsP( readLoc ) && walker.wantsNonPrimaryReads() ) {
activeRegion.add( (GATKSAMRecord) read );
}
}
reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
@ -207,7 +209,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
}
// integrate active regions into contiguous chunks based on active status
// integrate active regions into contiguous chunks with identical active status
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<ActiveRegion> activeList ) {
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
if( activeList.size() == 0 ) {

View File

@ -37,6 +37,10 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
return true; // We are keeping all the reads
}
public boolean wantsNonPrimaryReads() {
return false;
}
// Determine active status over the AlignmentContext
public abstract boolean isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);

View File

@ -1,19 +0,0 @@
package org.broadinstitute.sting.utils.activeregion;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 1/4/12
*/
public class ActiveRead {
final public GATKSAMRecord read;
final public boolean isPrimaryRegion;
ActiveRead( final GATKSAMRecord read, final boolean isPrimaryRegion ) {
this.read = read;
this.isPrimaryRegion = isPrimaryRegion;
}
}

View File

@ -16,8 +16,7 @@ import java.util.ArrayList;
public class ActiveRegion implements HasGenomeLocation {
private final ArrayList<ActiveRead> reads = new ArrayList<ActiveRead>();
private byte[] reference = null;
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
private final GenomeLoc activeRegionLoc;
private final GenomeLoc extendedLoc;
private final int extension;
@ -35,28 +34,31 @@ public class ActiveRegion implements HasGenomeLocation {
}
// add each read to the bin and extend the reference genome activeRegionLoc if needed
public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) {
public void add( final GATKSAMRecord read ) {
fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
reads.add( new ActiveRead(read, isPrimaryRegion) );
reads.add( read );
}
public ArrayList<ActiveRead> getReads() { return reads; }
public ArrayList<GATKSAMRecord> getReads() { return reads; }
public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) {
// set up the reference if we haven't done so yet
if ( reference == null ) {
reference = referenceReader.getSubsequenceAt(fullExtentReferenceLoc.getContig(), fullExtentReferenceLoc.getStart(), fullExtentReferenceLoc.getStop()).getBases();
}
return getReference( referenceReader, 0 );
}
return reference;
public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(),
Math.max(1, fullExtentReferenceLoc.getStart() - padding),
Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases();
}
@Override
public GenomeLoc getLocation() { return activeRegionLoc; }
public GenomeLoc getExtendedLoc() { return extendedLoc; }
public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; }
public int getExtension() { return extension; }
public int size() { return reads.size(); }
public void clearReads() { reads.clear(); }
public void remove( final GATKSAMRecord read ) { reads.remove( read ); }
public void removeAll( final ArrayList<GATKSAMRecord> readsToRemove ) { reads.removeAll( readsToRemove ); }
}

View File

@ -1,9 +1,15 @@
package org.broadinstitute.sting.utils.fragments;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
@ -121,4 +127,63 @@ public class FragmentUtils {
return create(reads, reads.size(), SamRecordGetter);
}
public final static List<GATKSAMRecord> mergeOverlappingPairedFragments( List<GATKSAMRecord> overlappingPair ) {
final byte MIN_QUAL_BAD_OVERLAP = 16;
if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); }
GATKSAMRecord firstRead = overlappingPair.get(0);
GATKSAMRecord secondRead = overlappingPair.get(1);
if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) {
firstRead = overlappingPair.get(1);
secondRead = overlappingPair.get(0);
}
if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) {
return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A
}
if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) {
return overlappingPair; // fragments contain indels so don't merge them
}
final Pair<Integer, Boolean> pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart());
final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() );
final int numBases = firstReadStop + secondRead.getReadLength();
final byte[] bases = new byte[numBases];
final byte[] quals = new byte[numBases];
final byte[] firstReadBases = firstRead.getReadBases();
final byte[] firstReadQuals = firstRead.getBaseQualities();
final byte[] secondReadBases = secondRead.getReadBases();
final byte[] secondReadQuals = secondRead.getBaseQualities();
for(int iii = 0; iii < firstReadStop; iii++) {
bases[iii] = firstReadBases[iii];
quals[iii] = firstReadQuals[iii];
}
for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) {
if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) {
return overlappingPair;// high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them
}
bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] );
quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] );
}
for(int iii = firstRead.getReadLength(); iii < numBases; iii++) {
bases[iii] = secondReadBases[iii-firstReadStop];
quals[iii] = secondReadQuals[iii-firstReadStop];
}
final GATKSAMRecord returnRead = new GATKSAMRecord(firstRead.getHeader());
returnRead.setAlignmentStart(firstRead.getUnclippedStart());
returnRead.setReadBases( bases );
returnRead.setBaseQualities( quals );
returnRead.setReadGroup( firstRead.getReadGroup() );
returnRead.setReferenceName( firstRead.getReferenceName() );
final CigarElement c = new CigarElement(bases.length, CigarOperator.M);
final ArrayList<CigarElement> cList = new ArrayList<CigarElement>();
cList.add(c);
returnRead.setCigar( new Cigar( cList ));
returnRead.setMappingQuality( firstRead.getMappingQuality() );
final ArrayList<GATKSAMRecord> returnList = new ArrayList<GATKSAMRecord>();
returnList.add(returnRead);
return returnList;
}
}