From 27c8fb1e4d60cbaa0a04f3cb39968897f571760e Mon Sep 17 00:00:00 2001 From: depristo Date: Sat, 26 Mar 2011 01:21:35 +0000 Subject: [PATCH] Added support for a general GATK option --simplifyBAM to automatically remove and simplify kept reads in an output BAM file. Specifically, duplicate, non-PF, and unmapped reads are removed, and all extended tags in the retained SAM records are removed except the RG:Z tag. This option is very useful when creating temporary BAM files (merged per-population or multi-sample cleaned) for future calling (as in the 1000G processing pipeline). Results in a significant reduction in space of the resulting BAM, faster reading of the BAM, and surprisingly even faster UG performance: 1-10mb of chromosome one, from NA12878 HiSeq 64x data set on hg18: Full BAM Write time: 8.6 m Size: 866M CountReads time: 2.9 m UG time: 11.3 m Simplified BAM: Write time: 6.2 Size: 458M CountReads time: 85.7 s UG time: 10.1 m git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5517 348d0f76-0448-11de-a6fe-93d51630548a --- .../arguments/GATKArgumentCollection.java | 7 +++ .../gatk/io/storage/SAMFileWriterStorage.java | 6 ++ .../gatk/io/stubs/SAMFileWriterStub.java | 4 ++ .../utils/sam/SimplifyingSAMFileWriter.java | 62 +++++++++++++++++++ 4 files changed, 79 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 4e81b8294..890b0830f 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -118,6 +118,10 @@ public class GATKArgumentCollection { @Input(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false) public String DBSNPFile = null; + @Element(required = false) + 
@Argument(fullName = "simplifyBAM", shortName = "simplifyBAM", doc = "If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier", required = false) + public boolean simplifyBAM = false; + /** * The override mechanism in the GATK, by default, populates the command-line arguments, then * the defaults from the walker annotations. Unfortunately, walker annotations should be trumped @@ -435,6 +439,9 @@ public class GATKArgumentCollection { if (enableLowMemorySharding != other.enableLowMemorySharding) return false; + if ( simplifyBAM != other.simplifyBAM ) + return false; + return true; } diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java index 03a95c30b..a3da3156c 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java @@ -36,6 +36,7 @@ import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.SimplifyingSAMFileWriter; /** * Provides temporary storage for SAMFileWriters. 
@@ -81,6 +82,11 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage, StingSAMFileWrite return samFile; } + public boolean simplifyBAM() { + return engine.getArguments().simplifyBAM; + } + public OutputStream getSAMOutputStream() { return samOutputStream; } diff --git a/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java b/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java new file mode 100644 index 000000000..df2010e8b --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + * Wraps a SAMFileWriter, forwarding only reads that pass keepRead() after simplifyRead() has cleared every attribute except RG. + */ +public class SimplifyingSAMFileWriter implements SAMFileWriter { + final SAMFileWriter dest; + + public SimplifyingSAMFileWriter(final SAMFileWriter finalDestination) { + this.dest = finalDestination; + } + + public void addAlignment( SAMRecord read ) { + if ( keepRead(read) ) { + dest.addAlignment(simplifyRead(read)); + + } + } + + /** + * Retrieves the header to use when creating the new SAM file. + * @return header to use when creating the new SAM file. + */ + public SAMFileHeader getFileHeader() { + return dest.getFileHeader(); + } + + /** + * {@inheritDoc} + */ + public void close() { + dest.close(); + } + + + public static final boolean keepRead(SAMRecord read) { + return ! 
excludeRead(read); + } + + public static final boolean excludeRead(SAMRecord read) { + return read.getReadUnmappedFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getDuplicateReadFlag() || read.getNotPrimaryAlignmentFlag(); + } + + public static final SAMRecord simplifyRead(SAMRecord read) { + // the only attribute we keep is the RG + Object rg = read.getAttribute("RG"); + read.clearAttributes(); + read.setAttribute("RG", rg); + return read; + } +}