Added method getOriginalReadGroupId(): takes merged (in case of collision) read group id as reported by a read coming from the merged stream and returns this read's read group id as it was listed in the original input bam file.
IndelRealigner now uses this functionality to correctly un-mangle read group id's in --nWayOut mode (i.e. when we need to write reads into separate output bams with headers matching the original inputs). Some hidden changes to IndelRealigner: purely testing and development, transparent to the users (hidden option added) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4775 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e5282742f9
commit
4e62de4213
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
|||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.picard.sam.MergingSamRecordIterator;
|
||||
|
||||
|
|
@ -103,9 +102,17 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
private final boolean hasReadGroupCollisions;
|
||||
|
||||
/**
|
||||
* Maps the SAM readers' original read group ids to their revised ids.
|
||||
* Maps the SAM readers' merged read group ids to their original ids. Since merged read group ids
|
||||
* are always unique, we can simply use a map here, no need to stratify by reader.
|
||||
*/
|
||||
private final Map<SAMReaderID,ReadGroupMapping> mergedReadGroupMappings = new HashMap<SAMReaderID,ReadGroupMapping>();
|
||||
private final ReadGroupMapping mergedToOriginalReadGroupMappings = new ReadGroupMapping();
|
||||
|
||||
/**
|
||||
* Maps the SAM readers' original read group ids to their revised ids. This mapping must be stratified
|
||||
* by readers, since there can be readgroup id collision: different bam files (readers) can list the
|
||||
* same read group id, which will be disambiguated when these input streams are merged.
|
||||
*/
|
||||
private final Map<SAMReaderID,ReadGroupMapping> originalToMergedReadGroupMappings = new HashMap<SAMReaderID,ReadGroupMapping>();
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
private static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||
|
|
@ -209,20 +216,24 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
generateExtendedEvents
|
||||
);
|
||||
|
||||
// cache the read group id (original) -> read group id (merged) mapping.
|
||||
// cache the read group id (original) -> read group id (merged)
|
||||
// and read group id (merged) -> read group id (original) mappings.
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
SAMFileReader reader = readers.getReader(id);
|
||||
ReadGroupMapping mapping = new ReadGroupMapping();
|
||||
ReadGroupMapping mappingToMerged = new ReadGroupMapping();
|
||||
|
||||
List<SAMReadGroupRecord> readGroups = reader.getFileHeader().getReadGroups();
|
||||
for(SAMReadGroupRecord readGroup: readGroups) {
|
||||
if(headerMerger.hasReadGroupCollisions())
|
||||
mapping.put(readGroup.getReadGroupId(),headerMerger.getReadGroupId(reader,readGroup.getReadGroupId()));
|
||||
else
|
||||
mapping.put(readGroup.getReadGroupId(),readGroup.getReadGroupId());
|
||||
if(headerMerger.hasReadGroupCollisions()) {
|
||||
mappingToMerged.put(readGroup.getReadGroupId(),headerMerger.getReadGroupId(reader,readGroup.getReadGroupId()));
|
||||
mergedToOriginalReadGroupMappings.put(headerMerger.getReadGroupId(reader,readGroup.getReadGroupId()),readGroup.getReadGroupId());
|
||||
} else {
|
||||
mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId());
|
||||
mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId());
|
||||
}
|
||||
}
|
||||
|
||||
mergedReadGroupMappings.put(id,mapping);
|
||||
originalToMergedReadGroupMappings.put(id,mappingToMerged);
|
||||
}
|
||||
|
||||
resourcePool.releaseReaders(readers);
|
||||
|
|
@ -296,7 +307,17 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
* @return Merged read group ID.
|
||||
*/
|
||||
public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) {
|
||||
return mergedReadGroupMappings.get(reader).get(originalReadGroupId);
|
||||
return originalToMergedReadGroupMappings.get(reader).get(originalReadGroupId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the original read group id (as it was specified in the original input bam file) that maps onto
|
||||
* this 'merged' read group id.
|
||||
* @param mergedReadGroupId 'merged' ID of the read group (as it is presented by the read received from merged input stream).
|
||||
* @return Merged read group ID.
|
||||
*/
|
||||
public String getOriginalReadGroupId(final String mergedReadGroupId) {
|
||||
return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -143,6 +143,9 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
"the two-column tab-separated file with the specified name must exist and list unique output file name (2nd column)" +
|
||||
"for each input file name (1st column).")
|
||||
protected String N_WAY_OUT = null;
|
||||
@Hidden
|
||||
@Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses")
|
||||
protected boolean CHECKEARLY = false;
|
||||
|
||||
|
||||
// DEBUGGING OPTIONS FOLLOW
|
||||
|
|
@ -198,6 +201,10 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
protected Map<SAMReaderID,SAMFileWriter> nwayWriters = null;
|
||||
|
||||
// debug info for lazy SW evaluation:
|
||||
private long exactMatchesFound = 0; // how many reads exactly matched a consensus we already had
|
||||
private long SWalignmentRuns = 0; // how many times (=for how many reads) we ran SW alignment
|
||||
private long SWalignmentSuccess = 0; // how many SW alignments were "successful" (i.e. found a workable indel and resulted in non-null consensus)
|
||||
public void initialize() {
|
||||
|
||||
if ( LOD_THRESHOLD < 0.0 )
|
||||
|
|
@ -373,6 +380,11 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
if ( N_WAY_OUT != null ) {
|
||||
SAMReaderID rid = getToolkit().getReaderIDForRead(read);
|
||||
SAMFileWriter w = nwayWriters.get(rid);
|
||||
// reset read's read group from merged to original if read group id collision has happened in merging:
|
||||
if ( getToolkit().getReadsDataSource().hasReadGroupCollisions() ) {
|
||||
read.setAttribute("RG",
|
||||
getToolkit().getReadsDataSource().getOriginalReadGroupId((String)read.getAttribute("RG")));
|
||||
}
|
||||
w.addAlignment(read);
|
||||
}
|
||||
}
|
||||
|
|
@ -387,6 +399,11 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
// in initialize() we ensured that every reader has exactly one tag, so the following line is safe:
|
||||
SAMReaderID rid = getToolkit().getReaderIDForRead(read);
|
||||
SAMFileWriter w = nwayWriters.get(rid);
|
||||
// reset read's read group from merged to original if read group id collision has happened in merging:
|
||||
if ( getToolkit().getReadsDataSource().hasReadGroupCollisions() ) {
|
||||
read.setAttribute("RG",
|
||||
getToolkit().getReadsDataSource().getOriginalReadGroupId((String)read.getAttribute("RG")));
|
||||
}
|
||||
w.addAlignment(read);
|
||||
|
||||
}
|
||||
|
|
@ -525,6 +542,13 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
if ( N_WAY_OUT != null ) {
|
||||
for ( SAMFileWriter w : nwayWriters.values() ) w.close();
|
||||
}
|
||||
if ( CHECKEARLY ) {
|
||||
logger.info("SW alignments runs: "+SWalignmentRuns);
|
||||
logger.info("SW alignments successfull: "+SWalignmentSuccess + " ("+SWalignmentSuccess/SWalignmentRuns+"% of SW runs)");
|
||||
logger.info("SW alignments skipped (perfect match): "+exactMatchesFound);
|
||||
logger.info("Total reads SW worked for: "+(SWalignmentSuccess + exactMatchesFound)+
|
||||
" ("+(SWalignmentSuccess+exactMatchesFound)/(SWalignmentRuns+exactMatchesFound)+"% of all reads requiring SW)");
|
||||
}
|
||||
}
|
||||
|
||||
private void populateKnownIndels(ReadMetaDataTracker metaDataTracker, ReferenceContext ref) {
|
||||
|
|
@ -591,7 +615,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
// use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference
|
||||
if ( !USE_KNOWN_INDELS_ONLY )
|
||||
generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference);
|
||||
generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference, leftmostIndex);
|
||||
|
||||
// if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n");
|
||||
|
||||
|
|
@ -839,12 +863,17 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
return totalRawMismatchSum;
|
||||
}
|
||||
|
||||
private void generateAlternateConsensesFromReads(final LinkedList<AlignedRead> altAlignmentsToTest, final Set<Consensus> altConsensesToPopulate, final byte[] reference) {
|
||||
private void generateAlternateConsensesFromReads(final LinkedList<AlignedRead> altAlignmentsToTest,
|
||||
final Set<Consensus> altConsensesToPopulate,
|
||||
final byte[] reference,
|
||||
final int leftmostIndex) {
|
||||
|
||||
// if we are under the limit, use all reads to generate alternate consenses
|
||||
if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) {
|
||||
for ( AlignedRead aRead : altAlignmentsToTest )
|
||||
createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference);
|
||||
for ( AlignedRead aRead : altAlignmentsToTest ) {
|
||||
if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex);
|
||||
else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference);
|
||||
}
|
||||
}
|
||||
// otherwise, choose reads for alternate consenses randomly
|
||||
else {
|
||||
|
|
@ -852,12 +881,14 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) {
|
||||
int index = generator.nextInt(altAlignmentsToTest.size());
|
||||
AlignedRead aRead = altAlignmentsToTest.remove(index);
|
||||
createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference);
|
||||
if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex);
|
||||
else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void createAndAddAlternateConsensus(final byte[] read, final Set<Consensus> altConsensesToPopulate, final byte[] reference) {
|
||||
|
||||
// do a pairwise alignment against the reference
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
|
||||
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read);
|
||||
|
|
@ -865,11 +896,33 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
altConsensesToPopulate.add(c);
|
||||
}
|
||||
|
||||
private void createAndAddAlternateConsensus1(AlignedRead read, final Set<Consensus> altConsensesToPopulate,
|
||||
final byte[] reference, final int leftmostIndex) {
|
||||
|
||||
for ( Consensus known : altConsensesToPopulate ) {
|
||||
Pair<Integer, Integer> altAlignment = findBestOffset(known.str, read, leftmostIndex);
|
||||
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
|
||||
int myScore = altAlignment.second;
|
||||
if ( myScore == 0 ) {exactMatchesFound++; return; }// read matches perfectly to a known alt consensus - no need to run SW, we already know the answer
|
||||
}
|
||||
// do a pairwise alignment against the reference
|
||||
SWalignmentRuns++;
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
|
||||
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read.getReadBases());
|
||||
if ( c != null ) {
|
||||
altConsensesToPopulate.add(c);
|
||||
SWalignmentSuccess++;
|
||||
}
|
||||
}
|
||||
|
||||
// create a Consensus from cigar/read strings which originate somewhere on the reference
|
||||
private Consensus createAlternateConsensus(final int indexOnRef, final Cigar c, final byte[] reference, final byte[] readStr) {
|
||||
if ( indexOnRef < 0 )
|
||||
return null;
|
||||
|
||||
// if there are no indels, we do not need this consensus, can abort early:
|
||||
if ( c.numCigarElements() == 1 && c.getCigarElement(0).getOperator() == CigarOperator.M ) return null;
|
||||
|
||||
// create the new consensus
|
||||
ArrayList<CigarElement> elements = new ArrayList<CigarElement>(c.numCigarElements()-1);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
|
|
|||
Loading…
Reference in New Issue