Added support for a general GATK option --simplifyBAM that automatically filters and simplifies the reads written to an output BAM file. Specifically, duplicate, non-PF, unmapped, and non-primary reads are removed, and all extended tags in the retained SAM records are removed except the RG:Z tag. This option is very useful when creating temporary BAM files (merged per-population or multi-sample cleaned) for future calling (as in the 1000G processing pipeline). It results in a significant reduction in the size of the resulting BAM, faster reading of the BAM, and, surprisingly, even faster UG performance:

1-10mb of chromosome one, from NA12878 HiSeq 64x data set on hg18:

Full BAM
Write time: 8.6 m
Size: 866M
CountReads time: 2.9 m
UG time: 11.3 m

Simplified BAM:
Write time: 6.2 m
Size: 458M
CountReads time: 85.7 s
UG time: 10.1 m


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5517 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-26 01:21:35 +00:00
parent fc8acd503e
commit 27c8fb1e4d
4 changed files with 79 additions and 0 deletions

View File

@ -118,6 +118,10 @@ public class GATKArgumentCollection {
@Input(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false)
public String DBSNPFile = null;
/**
 * Command-line switch (--simplifyBAM) requesting that output BAM files be simplified.
 * Defaults to false, i.e. output BAMs are written unmodified.
 */
@Element(required = false)
@Argument(fullName = "simplifyBAM", shortName = "simplifyBAM", doc = "If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, unmapped, and non-primary reads), as well as stripping all extended tags from the kept reads except the read group identifier", required = false)
public boolean simplifyBAM = false;
/**
* The override mechanism in the GATK, by default, populates the command-line arguments, then
* the defaults from the walker annotations. Unfortunately, walker annotations should be trumped
@ -435,6 +439,9 @@ public class GATKArgumentCollection {
if (enableLowMemorySharding != other.enableLowMemorySharding)
return false;
if ( simplifyBAM != other.simplifyBAM )
return false;
return true;
}

View File

@ -36,6 +36,7 @@ import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.SimplifyingSAMFileWriter;
/**
* Provides temporary storage for SAMFileWriters.
@ -81,6 +82,11 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage<SAMFileWrite
}
else
throw new UserException("Unable to write to SAM file; neither a target file nor a stream has been specified");
// if we want to send the BAM file through the simplifying writer, wrap it here
if ( stub.simplifyBAM() ) {
this.writer = new SimplifyingSAMFileWriter(this.writer);
}
}
public SAMFileHeader getFileHeader() {

View File

@ -137,6 +137,10 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
return samFile;
}
/**
 * Reports whether the simplifyBAM engine argument is set.
 * @return the value of the simplifyBAM flag read from the engine's argument collection.
 */
public boolean simplifyBAM() {
final boolean simplifyRequested = engine.getArguments().simplifyBAM;
return simplifyRequested;
}
/**
 * Retrieves the output stream associated with this stub.
 * NOTE(review): presumably null when the stub targets a file rather than a stream -- confirm.
 * @return the stream stored in this stub.
 */
public OutputStream getSAMOutputStream() {
return this.samOutputStream;
}

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
 * A SAMFileWriter that wraps another SAMFileWriter and simplifies the stream of reads
 * on its way to that writer.
 *
 * Reads that are unmapped, fail the vendor quality check, are marked as duplicates, or are
 * not primary alignments are silently dropped.  Every read that is kept has all of its
 * attributes cleared except for the RG attribute before being passed on.
 *
 * Note that kept reads are modified in place: the SAMRecord handed to
 * {@link #addAlignment(SAMRecord)} is the same object that is stripped and forwarded.
 */
public class SimplifyingSAMFileWriter implements SAMFileWriter {
    /** Tag of the only attribute preserved on reads that are kept. */
    private static final String READ_GROUP_TAG = "RG";

    /** The writer that ultimately receives the kept, simplified reads. */
    private final SAMFileWriter dest;

    /**
     * Creates a simplifying writer in front of an existing writer.
     * @param finalDestination the writer that kept reads are forwarded to; must not be null.
     * @throws IllegalArgumentException if finalDestination is null.
     */
    public SimplifyingSAMFileWriter(final SAMFileWriter finalDestination) {
        // fail here rather than with a NullPointerException on the first write
        if ( finalDestination == null )
            throw new IllegalArgumentException("finalDestination must not be null");
        this.dest = finalDestination;
    }

    /**
     * Forwards the read to the destination writer if it passes {@link #keepRead(SAMRecord)};
     * otherwise the read is dropped.  A forwarded read is simplified in place first.
     * @param read the read to consider for writing.
     */
    public void addAlignment( SAMRecord read ) {
        if ( keepRead(read) ) {
            dest.addAlignment(simplifyRead(read));
        }
    }

    /**
     * Retrieves the header to use when creating the new SAM file.
     * @return the header reported by the destination writer.
     */
    public SAMFileHeader getFileHeader() {
        return dest.getFileHeader();
    }

    /**
     * Closes the destination writer.
     */
    public void close() {
        dest.close();
    }

    /**
     * Decides whether a read should be written.
     * @param read the read to test.
     * @return true exactly when {@link #excludeRead(SAMRecord)} returns false.
     */
    public static final boolean keepRead(SAMRecord read) {
        return ! excludeRead(read);
    }

    /**
     * Decides whether a read should be dropped.
     * @param read the read to test.
     * @return true if the read is unmapped, fails the vendor quality check, is flagged as a
     *         duplicate, or is not a primary alignment.
     */
    public static final boolean excludeRead(SAMRecord read) {
        return read.getReadUnmappedFlag()
                || read.getReadFailsVendorQualityCheckFlag()
                || read.getDuplicateReadFlag()
                || read.getNotPrimaryAlignmentFlag();
    }

    /**
     * Strips every attribute from the read except RG.  The read is modified in place.
     * @param read the read to simplify.
     * @return the same read instance that was passed in.
     */
    public static final SAMRecord simplifyRead(SAMRecord read) {
        // the only attribute we keep is the RG
        final Object rg = read.getAttribute(READ_GROUP_TAG);
        read.clearAttributes();
        // only restore the tag when the read actually had one, rather than relying on how
        // setAttribute treats a null value
        if ( rg != null ) {
            read.setAttribute(READ_GROUP_TAG, rg);
        }
        return read;
    }
}