diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index 61f137fb4..d2d9d76b1 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import net.sf.picard.filter.SamRecordFilter; -import net.sf.picard.filter.FilteringIterator; import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.picard.sam.MergingSamRecordIterator; @@ -103,9 +102,17 @@ public class SAMDataSource implements SimpleDataSource { private final boolean hasReadGroupCollisions; /** - * Maps the SAM readers' original read group ids to their revised ids. + * Maps the SAM readers' merged read group ids to their original ids. Since merged read group ids + * are always unique, we can simply use a map here, no need to stratify by reader. */ - private final Map mergedReadGroupMappings = new HashMap(); + private final ReadGroupMapping mergedToOriginalReadGroupMappings = new ReadGroupMapping(); + + /** + * Maps the SAM readers' original read group ids to their revised ids. This mapping must be stratified + * by readers, since there can be readgroup id collision: different bam files (readers) can list the + * same read group id, which will be disambiguated when these input streams are merged. + */ + private final Map originalToMergedReadGroupMappings = new HashMap(); /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(SAMDataSource.class); @@ -209,20 +216,24 @@ public class SAMDataSource implements SimpleDataSource { generateExtendedEvents ); - // cache the read group id (original) -> read group id (merged) mapping. + // cache the read group id (original) -> read group id (merged) + // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { SAMFileReader reader = readers.getReader(id); - ReadGroupMapping mapping = new ReadGroupMapping(); + ReadGroupMapping mappingToMerged = new ReadGroupMapping(); List readGroups = reader.getFileHeader().getReadGroups(); for(SAMReadGroupRecord readGroup: readGroups) { - if(headerMerger.hasReadGroupCollisions()) - mapping.put(readGroup.getReadGroupId(),headerMerger.getReadGroupId(reader,readGroup.getReadGroupId())); - else - mapping.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); + if(headerMerger.hasReadGroupCollisions()) { + mappingToMerged.put(readGroup.getReadGroupId(),headerMerger.getReadGroupId(reader,readGroup.getReadGroupId())); + mergedToOriginalReadGroupMappings.put(headerMerger.getReadGroupId(reader,readGroup.getReadGroupId()),readGroup.getReadGroupId()); + } else { + mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); + mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); + } } - mergedReadGroupMappings.put(id,mapping); + originalToMergedReadGroupMappings.put(id,mappingToMerged); } resourcePool.releaseReaders(readers); @@ -296,7 +307,17 @@ public class SAMDataSource implements SimpleDataSource { * @return Merged read group ID. */ public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) { - return mergedReadGroupMappings.get(reader).get(originalReadGroupId); + return originalToMergedReadGroupMappings.get(reader).get(originalReadGroupId); + } + + /** + * Gets the original read group id (as it was specified in the original input bam file) that maps onto + * this 'merged' read group id. + * @param mergedReadGroupId 'merged' ID of the read group (as it is presented by the read received from merged input stream). + * @return Merged read group ID. + */ + public String getOriginalReadGroupId(final String mergedReadGroupId) { + return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 9e96e15c4..3b2e50cd2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -143,6 +143,9 @@ public class IndelRealigner extends ReadWalker { "the two-column tab-separated file with the specified name must exist and list unique output file name (2nd column)" + "for each input file name (1st column).") protected String N_WAY_OUT = null; + @Hidden + @Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses") + protected boolean CHECKEARLY = false; // DEBUGGING OPTIONS FOLLOW @@ -198,6 +201,10 @@ public class IndelRealigner extends ReadWalker { protected Map nwayWriters = null; + // debug info for lazy SW evaluation: + private long exactMatchesFound = 0; // how many reads exactly matched a consensus we already had + private long SWalignmentRuns = 0; // how many times (=for how many reads) we ran SW alignment + private long SWalignmentSuccess = 0; // how many SW alignments were "successful" (i.e. found a workable indel and resulted in non-null consensus) public void initialize() { if ( LOD_THRESHOLD < 0.0 ) @@ -373,6 +380,11 @@ public class IndelRealigner extends ReadWalker { if ( N_WAY_OUT != null ) { SAMReaderID rid = getToolkit().getReaderIDForRead(read); SAMFileWriter w = nwayWriters.get(rid); + // reset read's read group from merged to original if read group id collision has happened in merging: + if ( getToolkit().getReadsDataSource().hasReadGroupCollisions() ) { + read.setAttribute("RG", + getToolkit().getReadsDataSource().getOriginalReadGroupId((String)read.getAttribute("RG"))); + } w.addAlignment(read); } } @@ -387,6 +399,11 @@ public class IndelRealigner extends ReadWalker { // in initialize() we ensured that every reader has exactly one tag, so the following line is safe: SAMReaderID rid = getToolkit().getReaderIDForRead(read); SAMFileWriter w = nwayWriters.get(rid); + // reset read's read group from merged to original if read group id collision has happened in merging: + if ( getToolkit().getReadsDataSource().hasReadGroupCollisions() ) { + read.setAttribute("RG", + getToolkit().getReadsDataSource().getOriginalReadGroupId((String)read.getAttribute("RG"))); + } w.addAlignment(read); } @@ -525,6 +542,13 @@ public class IndelRealigner extends ReadWalker { if ( N_WAY_OUT != null ) { for ( SAMFileWriter w : nwayWriters.values() ) w.close(); } + if ( CHECKEARLY ) { + logger.info("SW alignments runs: "+SWalignmentRuns); + logger.info("SW alignments successfull: "+SWalignmentSuccess + " ("+SWalignmentSuccess/SWalignmentRuns+"% of SW runs)"); + logger.info("SW alignments skipped (perfect match): "+exactMatchesFound); + logger.info("Total reads SW worked for: "+(SWalignmentSuccess + exactMatchesFound)+ + " ("+(SWalignmentSuccess+exactMatchesFound)/(SWalignmentRuns+exactMatchesFound)+"% of all reads requiring SW)"); + } } private void populateKnownIndels(ReadMetaDataTracker metaDataTracker, ReferenceContext ref) { @@ -591,7 +615,7 @@ public class IndelRealigner extends ReadWalker { // use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference if ( !USE_KNOWN_INDELS_ONLY ) - generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference); + generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference, leftmostIndex); // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n"); @@ -839,12 +863,17 @@ public class IndelRealigner extends ReadWalker { return totalRawMismatchSum; } - private void generateAlternateConsensesFromReads(final LinkedList altAlignmentsToTest, final Set altConsensesToPopulate, final byte[] reference) { + private void generateAlternateConsensesFromReads(final LinkedList altAlignmentsToTest, + final Set altConsensesToPopulate, + final byte[] reference, + final int leftmostIndex) { // if we are under the limit, use all reads to generate alternate consenses if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { - for ( AlignedRead aRead : altAlignmentsToTest ) - createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); + for ( AlignedRead aRead : altAlignmentsToTest ) { + if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); + else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); + } } // otherwise, choose reads for alternate consenses randomly else { @@ -852,12 +881,14 @@ public class IndelRealigner extends ReadWalker { while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) { int index = generator.nextInt(altAlignmentsToTest.size()); AlignedRead aRead = altAlignmentsToTest.remove(index); - createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); + if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); + else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); } } } private void createAndAddAlternateConsensus(final byte[] read, final Set altConsensesToPopulate, final byte[] reference) { + // do a pairwise alignment against the reference SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read); @@ -865,11 +896,33 @@ public class IndelRealigner extends ReadWalker { altConsensesToPopulate.add(c); } + private void createAndAddAlternateConsensus1(AlignedRead read, final Set altConsensesToPopulate, + final byte[] reference, final int leftmostIndex) { + + for ( Consensus known : altConsensesToPopulate ) { + Pair altAlignment = findBestOffset(known.str, read, leftmostIndex); + // the mismatch score is the min of its alignment vs. the reference and vs. the alternate + int myScore = altAlignment.second; + if ( myScore == 0 ) {exactMatchesFound++; return; }// read matches perfectly to a known alt consensus - no need to run SW, we already know the answer + } + // do a pairwise alignment against the reference + SWalignmentRuns++; + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read.getReadBases()); + if ( c != null ) { + altConsensesToPopulate.add(c); + SWalignmentSuccess++; + } + } + // create a Consensus from cigar/read strings which originate somewhere on the reference private Consensus createAlternateConsensus(final int indexOnRef, final Cigar c, final byte[] reference, final byte[] readStr) { if ( indexOnRef < 0 ) return null; + // if there are no indels, we do not need this consensus, can abort early: + if ( c.numCigarElements() == 1 && c.getCigarElement(0).getOperator() == CigarOperator.M ) return null; + // create the new consensus ArrayList elements = new ArrayList(c.numCigarElements()-1); StringBuilder sb = new StringBuilder();