Added support for a general GATK option, --simplifyBAM, that automatically filters and simplifies the reads written to an output BAM file. Specifically, duplicate, non-PF, unmapped, and non-primary reads are removed, and all extended tags in the retained SAM records are removed except the RG:Z tag. This option is very useful when creating temporary BAM files (merged per-population or multi-sample cleaned) for future calling (as in the 1000G processing pipeline). It results in a significant reduction in the size of the resulting BAM, faster reading of the BAM, and, surprisingly, even faster UG performance:
1-10mb of chromosome one, from NA12878 HiSeq 64x data set on hg18: Full BAM Write time: 8.6 m Size: 866M CountReads time: 2.9 m UG time: 11.3 m Simplified BAM: Write time: 6.2 Size: 458M CountReads time: 85.7 s UG time: 10.1 m git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5517 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fc8acd503e
commit
27c8fb1e4d
|
|
@ -118,6 +118,10 @@ public class GATKArgumentCollection {
|
|||
@Input(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false)
|
||||
public String DBSNPFile = null;
|
||||
|
||||
@Element(required = false)
|
||||
@Argument(fullName = "simplifyBAM", shortName = "simplifyBAM", doc = "If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier", required = false)
|
||||
public boolean simplifyBAM = false;
|
||||
|
||||
/**
|
||||
* The override mechanism in the GATK, by default, populates the command-line arguments, then
|
||||
* the defaults from the walker annotations. Unfortunately, walker annotations should be trumped
|
||||
|
|
@ -435,6 +439,9 @@ public class GATKArgumentCollection {
|
|||
if (enableLowMemorySharding != other.enableLowMemorySharding)
|
||||
return false;
|
||||
|
||||
if ( simplifyBAM != other.simplifyBAM )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ import net.sf.samtools.util.RuntimeIOException;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.SimplifyingSAMFileWriter;
|
||||
|
||||
/**
|
||||
* Provides temporary storage for SAMFileWriters.
|
||||
|
|
@ -81,6 +82,11 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage<SAMFileWrite
|
|||
}
|
||||
else
|
||||
throw new UserException("Unable to write to SAM file; neither a target file nor a stream has been specified");
|
||||
|
||||
// if we want to send the BAM file through the simplifying writer, wrap it here
|
||||
if ( stub.simplifyBAM() ) {
|
||||
this.writer = new SimplifyingSAMFileWriter(this.writer);
|
||||
}
|
||||
}
|
||||
|
||||
public SAMFileHeader getFileHeader() {
|
||||
|
|
|
|||
|
|
@ -137,6 +137,10 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
|
|||
return samFile;
|
||||
}
|
||||
|
||||
public boolean simplifyBAM() {
|
||||
return engine.getArguments().simplifyBAM;
|
||||
}
|
||||
|
||||
public OutputStream getSAMOutputStream() {
|
||||
return samOutputStream;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,62 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* XXX
|
||||
*/
|
||||
public class SimplifyingSAMFileWriter implements SAMFileWriter {
|
||||
final SAMFileWriter dest;
|
||||
|
||||
public SimplifyingSAMFileWriter(final SAMFileWriter finalDestination) {
|
||||
this.dest = finalDestination;
|
||||
}
|
||||
|
||||
public void addAlignment( SAMRecord read ) {
|
||||
if ( keepRead(read) ) {
|
||||
dest.addAlignment(simplifyRead(read));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the header to use when creating the new SAM file.
|
||||
* @return header to use when creating the new SAM file.
|
||||
*/
|
||||
public SAMFileHeader getFileHeader() {
|
||||
return dest.getFileHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* @{inheritDoc}
|
||||
*/
|
||||
public void close() {
|
||||
dest.close();
|
||||
}
|
||||
|
||||
|
||||
public static final boolean keepRead(SAMRecord read) {
|
||||
return ! excludeRead(read);
|
||||
}
|
||||
|
||||
public static final boolean excludeRead(SAMRecord read) {
|
||||
return read.getReadUnmappedFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getDuplicateReadFlag() || read.getNotPrimaryAlignmentFlag();
|
||||
}
|
||||
|
||||
public static final SAMRecord simplifyRead(SAMRecord read) {
|
||||
// the only attribute we keep is the RG
|
||||
Object rg = read.getAttribute("RG");
|
||||
read.clearAttributes();
|
||||
read.setAttribute("RG", rg);
|
||||
return read;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue