Standalone, GATK-dependent tool that reads a list of BAM files and slices all of them into a single merged BAM file containing the reads that overlap a chr:start-stop interval. Highly efficient when working with thousands of BAM files: it can merge 1MB of sequence from 1600 4x BAMs, using 4g of memory, in only 2 hours.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5558 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-04-02 13:41:29 +00:00
parent 0b02804dcb
commit 11822da578
2 changed files with 176 additions and 0 deletions

View File

@ -419,6 +419,15 @@
<attribute name="Main-Class" value="org.broadinstitute.sting.oneoffprojects.tools.CompareBAMAlignments" />
</manifest>
</jar>
<!-- Packages the compiled classes matching **/tools/**/*.class from ${java.classes} into
     ${dist.dir}/SliceBams.jar and sets SliceBams as the jar's Main-Class.
     whenmanifestonly="skip": per the Ant jar task, the jar is not created when no files match. -->
<jar jarfile="${dist.dir}/SliceBams.jar" whenmanifestonly="skip">
<fileset dir="${java.classes}">
<include name="**/tools/**/*.class" />
</fileset>
<manifest>
<attribute name="Main-Class" value="org.broadinstitute.sting.playground.tools.SliceBams" />
</manifest>
</jar>
</target>
<target name="scala.jar" depends="scala.compile, init.jar" if="scala.include">
@ -483,6 +492,11 @@
<attribute name="Class-Path" value="${jar.classpath}" />
</manifest>
</jar>
<!-- Updates ${dist.dir}/SliceBams.jar in place (update="true"), adding a Class-Path manifest
     attribute taken from ${jar.classpath}. No fileset is given, so only the manifest is touched.
     NOTE(review): with whenmanifestonly="skip" this is presumably a no-op when the jar was
     never built; confirm against the Ant jar task documentation. -->
<jar jarfile="${dist.dir}/SliceBams.jar" update="true" whenmanifestonly="skip">
<manifest>
<attribute name="Class-Path" value="${jar.classpath}" />
</manifest>
</jar>
</target>
<target name="queue.manifests" depends="queue.jar, init.manifests" if="scala.include">

View File

@ -0,0 +1,162 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.tools;
import net.sf.picard.PicardException;
import net.sf.picard.cmdline.CommandLineProgram;
import net.sf.picard.cmdline.Option;
import net.sf.picard.cmdline.StandardOptionDefinitions;
import net.sf.picard.cmdline.Usage;
import net.sf.picard.io.IoUtil;
import net.sf.picard.sam.MergingSamRecordIterator;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.picard.util.Log;
import net.sf.samtools.*;
import net.sf.samtools.util.BlockCompressedOutputStream;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.SimplifyingSAMFileWriter;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
/**
 * Reads a list of BAM files and slices all of them into a single merged BAM file
 * containing the reads that overlap a single chr:start-stop interval.
 *
 * <p>The tool works in two passes over the listed files: the first pass reads only
 * the headers in order to build the header of the output file, the second pass
 * queries each input for the requested interval and copies the matching records
 * to the output.
 *
 * @author Mark DePristo
 */
public class SliceBams extends CommandLineProgram {
    private static final Log log = Log.getInstance(SliceBams.class);

    // Usage and parameters
    @Usage
    public String USAGE = "Merges multiple SAM/BAM files into one BAM overlapping chr:start-stop interval .\n";

    @Option(shortName="I", doc="List of input BAM files")
    public File INPUT_LIST;

    @Option(shortName="O", doc="SAM or BAM file to write merged result to")
    public File OUTPUT;

    @Option(shortName="L", doc="Location to include")
    public String SLICE;

    /** A progress message is logged every time this many records have been copied. */
    private static final int PROGRESS_INTERVAL = 1000000;

    /** Required main method implementation. */
    public static void main(final String[] argv) {
        System.exit(new SliceBams().instanceMain(argv));
    }

    /**
     * Reads the list file and turns each non-blank line into a {@link File}.
     *
     * @param list text file whose lines are paths of the input BAM files
     * @return the input files, in the order in which they appear in the list
     * @throws PicardException if the list file cannot be found
     */
    private List<File> parseInputFiles(File list) {
        try {
            final List<File> files = new ArrayList<File>();
            for (String fileName : new XReadLines(list).readLines() ) {
                // A blank line would otherwise become new File("") and fail the
                // readability check later with a confusing message.
                if (fileName.trim().length() == 0) {
                    continue;
                }
                files.add(new File(fileName));
            }
            return files;
        } catch ( FileNotFoundException e ) {
            throw new PicardException("Couldn't read input list", e);
        }
    }

    /**
     * Walk over the input files, reading the headers, and finally prepare the output
     * BAM containing a merge of all of the headers.
     *
     * <p>The header of the first input file is used as the base of the output header;
     * only the read groups of every subsequent file are added to it.
     *
     * @param inputBAMs the input files whose headers are to be combined; must not be empty
     * @return a writer for {@code OUTPUT}, wrapped in a {@link SimplifyingSAMFileWriter}
     * @throws PicardException if {@code inputBAMs} is empty, since no output header can be built
     */
    private SAMFileWriter createOutputBAM(List<File> inputBAMs) {
        SAMFileHeader header = null;

        log.info("Reading headers");
        int fileCounter = 1;
        for (final File inFile : inputBAMs) {
            IoUtil.assertFileIsReadable(inFile);
            final SAMFileReader inReader = new SAMFileReader(inFile, null); // null because we don't want it to look for the index
            try {
                final SAMFileHeader inHeader = inReader.getFileHeader();
                log.info(" Reading header from file " + inFile + " " + fileCounter++ + " of " + inputBAMs.size());

                if (header == null) {
                    header = inHeader;
                }
                else {
                    for ( SAMReadGroupRecord rg : inHeader.getReadGroups() )
                        header.addReadGroup(rg);
                }
            } finally {
                // Release the file handle even if reading or merging the header fails.
                inReader.close();
            }
        }

        if (header == null) {
            // Without at least one input there is no header to give the writer.
            throw new PicardException("No input BAM files were found in " + INPUT_LIST);
        }

        // presorted == false: records are added file by file, not in coordinate order.
        SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT);
        return new SimplifyingSAMFileWriter(out);
    }

    /**
     * Copies every record of {@code inFile} that overlaps {@code loc} to {@code out}.
     *
     * @param inFile        the BAM file to query
     * @param loc           the interval whose overlapping reads are copied
     * @param out           the writer receiving the records
     * @param recordsSoFar  number of records copied from previously processed files,
     *                      used so that progress messages count across all inputs
     * @return {@code recordsSoFar} plus the number of records copied from {@code inFile}
     */
    private long copyOverlappingReads(final File inFile, final GenomeLoc loc, final SAMFileWriter out, final long recordsSoFar) {
        long numRecords = recordsSoFar;
        final SAMFileReader reader = new SAMFileReader(inFile);
        try {
            final SAMRecordIterator iterator = reader.queryOverlapping(loc.getContig(), loc.getStart(), loc.getStop());
            try {
                while ( iterator.hasNext() ) {
                    final SAMRecord record = iterator.next();
                    out.addAlignment(record);
                    if (++numRecords % PROGRESS_INTERVAL == 0) {
                        log.info(numRecords + " records read.");
                    }
                }
            } finally {
                iterator.close();
            }
        } finally {
            reader.close();
        }
        return numRecords;
    }

    /**
     * Combines multiple SAM/BAM files into one.
     *
     * @return 0 on success
     */
    @Override
    protected int doWork() {
        SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT);
        SAMFileWriterFactory.setDefaultCreateIndexWhileWriting(true);

        // Open the files for reading and writing
        List<File> inputBAMs = parseInputFiles(INPUT_LIST);

        IoUtil.assertFileIsWritable(OUTPUT);
        final SAMFileWriter out = createOutputBAM(inputBAMs);

        // The slice is parsed against the sequence dictionary of the output header.
        GenomeLocParser glParser = new GenomeLocParser(out.getFileHeader().getSequenceDictionary());
        GenomeLoc loc = glParser.parseGenomeLoc(SLICE);

        log.info("Reading BAM records");
        long numRecords = 0;
        int fileCounter = 1;
        for (final File inFile : inputBAMs) {
            IoUtil.assertFileIsReadable(inFile);
            log.info(" Reading file " + inFile + " " + fileCounter++ + " of " + inputBAMs.size());
            numRecords = copyOverlappingReads(inFile, loc, out, numRecords);
        }

        log.info("Finished reading inputs.");
        log.info("Sorting final output file.");
        out.close();
        return 0;
    }
}