A 'fat shard' finder. Cranks through the indices of a BAM file or list of

BAM files looking for outliers (outliers right now are defined naively  as 
shards whose sizes are more than 5 stddevs away from the mean).  Runs in
13 minutes per chromosome on 707 low pass whole genome BAMs -- not great, but
much faster than running UG on the same region to discover anomalies.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5782 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2011-05-10 12:56:47 +00:00
parent 3ffc2ccd81
commit f275be6968
6 changed files with 356 additions and 115 deletions

View File

@ -47,6 +47,7 @@ import java.util.*;
import net.sf.picard.filter.SamRecordFilter;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.ListFileUtils;
import org.broadinstitute.sting.utils.text.XReadLines;
/**
@ -72,11 +73,6 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
*/
private final Collection<Object> argumentSources = new ArrayList<Object>();
/**
* Lines starting with this String in .list files are considered comments.
*/
public static final String LIST_FILE_COMMENT_START = "#";
/**
* this is the function that the inheriting class can expect to have called
* when the command line system has initialized.
@ -93,8 +89,8 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
engine.setArguments(getArgumentCollection());
// File lists can require a bit of additional expansion. Set these explicitly by the engine.
engine.setSAMFileIDs(unpackBAMFileList(getArgumentCollection()));
engine.setReferenceMetaDataFiles(unpackRODBindings(getArgumentCollection()));
engine.setSAMFileIDs(ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser));
engine.setReferenceMetaDataFiles(ListFileUtils.unpackRODBindings(getArgumentCollection().RODBindings,getArgumentCollection().DBSNPFile,parser));
engine.setWalker(walker);
walker.setToolkit(engine);
@ -200,104 +196,4 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
return engine.getWalkerName((Class<Walker>)argumentSource);
}
/**
* Unpack the bam files to be processed, given a list of files. That list of files can
* itself contain entries which are lists of other files to be read (note: you cannot have lists
* of lists of lists). Lines in .list files containing only whitespace or which begin with
* LIST_FILE_COMMENT_START are ignored.
*
* @param argCollection the command-line arguments from which to extract the BAM file list.
* @return a flattened list of the bam files provided
*/
protected List<SAMReaderID> unpackBAMFileList(GATKArgumentCollection argCollection) {
List<SAMReaderID> unpackedReads = new ArrayList<SAMReaderID>();
for( String inputFileName: argCollection.samFiles ) {
Tags inputFileNameTags = parser.getTags(inputFileName);
inputFileName = expandFileName(inputFileName);
if (inputFileName.toLowerCase().endsWith(".list") ) {
try {
for ( String fileName : new XReadLines(new File(inputFileName), true) ) {
if ( fileName.length() > 0 && ! fileName.startsWith(LIST_FILE_COMMENT_START) ) {
unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName)));
}
}
}
catch( FileNotFoundException ex ) {
throw new UserException.CouldNotReadInputFile(new File(inputFileName), "Unable to find file while unpacking reads", ex);
}
}
else if(inputFileName.toLowerCase().endsWith(".bam")) {
unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags));
}
else if(inputFileName.endsWith("stdin")) {
unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags));
}
else {
throw new UserException.CommandLineException(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " +
"with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " +
"of BAM files is in the correct format, update the extension, and try again.",inputFileName));
}
}
return unpackedReads;
}
/**
* Convert command-line argument representation of ROD bindings to something more easily understandable by the engine.
* @param argCollection input arguments to the GATK.
* @return a list of expanded, bound RODs.
*/
private Collection<RMDTriplet> unpackRODBindings(GATKArgumentCollection argCollection) {
Collection<RMDTriplet> rodBindings = new ArrayList<RMDTriplet>();
for (String fileName: argCollection.RODBindings) {
final Tags tags = parser.getTags(fileName);
fileName = expandFileName(fileName);
List<String> positionalTags = tags.getPositionalTags();
if(positionalTags.size() != 2)
throw new UserException("Invalid syntax for -B (reference-ordered data) input flag. " +
"Please use the following syntax when providing reference-ordered " +
"data: -B:<name>,<type> <filename>.");
// Assume that if tags are present, those tags are name and type.
// Name is always first, followed by type.
String name = positionalTags.get(0);
String type = positionalTags.get(1);
RMDStorageType storageType = null;
if(tags.getValue("storage") != null)
storageType = Enum.valueOf(RMDStorageType.class,tags.getValue("storage"));
else if(fileName.toLowerCase().endsWith("stdin"))
storageType = RMDStorageType.STREAM;
else
storageType = RMDStorageType.FILE;
rodBindings.add(new RMDTriplet(name,type,fileName,storageType,tags));
}
if (argCollection.DBSNPFile != null) {
if(argCollection.DBSNPFile.toLowerCase().contains("vcf"))
throw new UserException("--DBSNP (-D) argument currently does not support VCF. To use dbSNP in VCF format, please use -B:dbsnp,vcf <filename>.");
final Tags tags = parser.getTags(argCollection.DBSNPFile);
String fileName = expandFileName(argCollection.DBSNPFile);
RMDStorageType storageType = fileName.toLowerCase().endsWith("stdin") ? RMDStorageType.STREAM : RMDStorageType.FILE;
rodBindings.add(new RMDTriplet(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME,"dbsnp",fileName,storageType,tags));
}
return rodBindings;
}
/**
* Expand any special characters that appear in the filename. Right now, '-' is expanded to
* '/dev/stdin' only, but in the future, special characters like '~' and '*' that are passed
* directly to the command line in some circumstances could be expanded as well. Be careful
* when adding UNIX-isms.
* @param argument the text appearing on the command-line.
* @return An expanded string suitable for opening by Java/UNIX file handling utilities.
*/
private String expandFileName(String argument) {
if(argument.trim().equals("-"))
return "/dev/stdin";
return argument;
}
}

View File

@ -46,7 +46,7 @@ import java.util.TreeMap;
/**
* Represents a small section of a BAM file, and every associated interval.
*/
class FilePointer {
public class FilePointer {
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
protected final BAMOverlap overlap;
protected final List<GenomeLoc> locations;
@ -72,6 +72,14 @@ class FilePointer {
this.isRegionUnmapped = false;
}
/**
* Returns an immutable variant of the list of locations.
* @return
*/
public List<GenomeLoc> getLocations() {
return Collections.unmodifiableList(locations);
}
@Override
public boolean equals(final Object other) {
if(!(other instanceof FilePointer))

View File

@ -150,7 +150,7 @@ public class SAMDataSource {
new ArrayList<ReadFilter>(),
false,
false,
false);
true);
}
/**

View File

@ -0,0 +1,187 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.utilities;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.text.ListFileUtils;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
/**
* Traverses a region in a dataset looking for outliers.
*/
public class FindLargeShards extends CommandLineProgram {
private static Logger logger = Logger.getLogger(FindLargeShards.class);
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false)
public List<String> samFiles = new ArrayList<String>();
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
@Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.",required=false)
public List<String> intervals = null;
/**
* The square of the sum of all uncompressed data. Based on the BAM spec, the size of this could be
* up to (2^64)^2.
*/
private BigInteger sumOfSquares = BigInteger.valueOf(0);
/**
* The running sum of all uncompressed data. Based on the BAM spec, the BAM must be less than Long.MAX_LONG
* when compressed -- in other words, the sum of the sizes of all BGZF blocks must be < 2^64.
*/
private BigInteger sum = BigInteger.valueOf(0);
/**
* The number of shards viewed.
*/
private long numberOfShards;
@Override
public int execute() throws IOException {
// initialize reference
IndexedFastaSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile);
GenomeLocParser genomeLocParser = new GenomeLocParser(refReader);
// initialize reads
List<SAMReaderID> bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser);
SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser);
// intervals
GenomeLocSortedSet intervalSortedSet = null;
if(intervals != null)
intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals, true), IntervalMergingRule.ALL);
else {
intervalSortedSet = new GenomeLocSortedSet(genomeLocParser);
for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences())
intervalSortedSet.add(genomeLocParser.createGenomeLoc(entry.getSequenceName(),1,entry.getSequenceLength()));
}
logger.info(String.format("Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
while(sharder.hasNext()) {
FilePointer filePointer = sharder.next();
// Size of the file pointer.
final long size = filePointer.size();
BigInteger bigSize = BigInteger.valueOf(size);
sumOfSquares = sumOfSquares.add(bigSize.pow(2));
sum = sum.add(bigSize);
numberOfShards++;
if(numberOfShards % 1000 == 0) {
GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser);
logger.info(String.format("Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
}
}
// Print out the stddev: (sum(x^2) - (1/N)*sum(x)^2)/N
long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue();
long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue()));
logger.info(String.format("Number of shards: %d; mean uncompressed size = %d; stddev uncompressed size = %d%n",numberOfShards,mean,stddev));
// Crank through the shards again, this time reporting on the shards significantly larger than the mean.
long threshold = mean + stddev*5;
logger.warn(String.format("Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
while(sharder.hasNext()) {
FilePointer filePointer = sharder.next();
// Bounding region.
GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser);
// Size of the file pointer.
final long size = filePointer.size();
numberOfShards++;
if(filePointer.size() <= threshold) {
if(numberOfShards % 1000 == 0)
logger.info(String.format("%s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
continue;
}
logger.warn(String.format("FOUND LARGE SHARD: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
}
return 0;
}
private GenomeLoc getBoundingRegion(final FilePointer filePointer, final GenomeLocParser genomeLocParser) {
List<GenomeLoc> regions = filePointer.getLocations();
// The region contained by this FilePointer.
final String contig = regions.get(0).getContig();
final int start = regions.get(0).getStart();
final int stop = regions.get(regions.size()-1).getStop();
return genomeLocParser.createGenomeLoc(contig,start,stop);
}
/**
* Required main method implementation.
* @param argv Command-line argument text.
* @throws Exception on error.
*/
public static void main(String[] argv) throws Exception {
int returnCode = 0;
try {
FindLargeShards instance = new FindLargeShards();
start(instance, argv);
returnCode = 0;
}
catch(Exception ex) {
returnCode = 1;
ex.printStackTrace();
throw ex;
}
finally {
System.exit(returnCode);
}
}
}

View File

@ -0,0 +1,152 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.text;
import org.broadinstitute.sting.commandline.ParsingEngine;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* A collection of convenience methods for working with list files.
*/
public class ListFileUtils {
/**
* Lines starting with this String in .list files are considered comments.
*/
public static final String LIST_FILE_COMMENT_START = "#";
/**
* Unpack the bam files to be processed, given a list of files. That list of files can
* itself contain entries which are lists of other files to be read (note: you cannot have lists
* of lists of lists). Lines in .list files containing only whitespace or which begin with
* LIST_FILE_COMMENT_START are ignored.
*
* @param samFiles The sam files, in string format.
* @return a flattened list of the bam files provided
*/
public static List<SAMReaderID> unpackBAMFileList(final List<String> samFiles, final ParsingEngine parser) {
List<SAMReaderID> unpackedReads = new ArrayList<SAMReaderID>();
for( String inputFileName: samFiles ) {
Tags inputFileNameTags = parser.getTags(inputFileName);
inputFileName = expandFileName(inputFileName);
if (inputFileName.toLowerCase().endsWith(".list") ) {
try {
for ( String fileName : new XReadLines(new File(inputFileName), true) ) {
if ( fileName.length() > 0 && ! fileName.startsWith(LIST_FILE_COMMENT_START) ) {
unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName)));
}
}
}
catch( FileNotFoundException ex ) {
throw new UserException.CouldNotReadInputFile(new File(inputFileName), "Unable to find file while unpacking reads", ex);
}
}
else if(inputFileName.toLowerCase().endsWith(".bam")) {
unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags));
}
else if(inputFileName.endsWith("stdin")) {
unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags));
}
else {
throw new UserException.CommandLineException(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " +
"with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " +
"of BAM files is in the correct format, update the extension, and try again.",inputFileName));
}
}
return unpackedReads;
}
/**
* Convert command-line argument representation of ROD bindings to something more easily understandable by the engine.
* @param RODBindings a text equivale
* @return a list of expanded, bound RODs.
*/
public static Collection<RMDTriplet> unpackRODBindings(final List<String> RODBindings, final String dbSNPFile, final ParsingEngine parser) {
Collection<RMDTriplet> rodBindings = new ArrayList<RMDTriplet>();
for (String fileName: RODBindings) {
final Tags tags = parser.getTags(fileName);
fileName = expandFileName(fileName);
List<String> positionalTags = tags.getPositionalTags();
if(positionalTags.size() != 2)
throw new UserException("Invalid syntax for -B (reference-ordered data) input flag. " +
"Please use the following syntax when providing reference-ordered " +
"data: -B:<name>,<type> <filename>.");
// Assume that if tags are present, those tags are name and type.
// Name is always first, followed by type.
String name = positionalTags.get(0);
String type = positionalTags.get(1);
RMDTriplet.RMDStorageType storageType = null;
if(tags.getValue("storage") != null)
storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,tags.getValue("storage"));
else if(fileName.toLowerCase().endsWith("stdin"))
storageType = RMDTriplet.RMDStorageType.STREAM;
else
storageType = RMDTriplet.RMDStorageType.FILE;
rodBindings.add(new RMDTriplet(name,type,fileName,storageType,tags));
}
if (dbSNPFile != null) {
if(dbSNPFile.toLowerCase().contains("vcf"))
throw new UserException("--DBSNP (-D) argument currently does not support VCF. To use dbSNP in VCF format, please use -B:dbsnp,vcf <filename>.");
final Tags tags = parser.getTags(dbSNPFile);
String fileName = expandFileName(dbSNPFile);
RMDTriplet.RMDStorageType storageType = fileName.toLowerCase().endsWith("stdin") ? RMDTriplet.RMDStorageType.STREAM : RMDTriplet.RMDStorageType.FILE;
rodBindings.add(new RMDTriplet(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME,"dbsnp",fileName,storageType,tags));
}
return rodBindings;
}
/**
* Expand any special characters that appear in the filename. Right now, '-' is expanded to
* '/dev/stdin' only, but in the future, special characters like '~' and '*' that are passed
* directly to the command line in some circumstances could be expanded as well. Be careful
* when adding UNIX-isms.
* @param argument the text appearing on the command-line.
* @return An expanded string suitable for opening by Java/UNIX file handling utilities.
*/
private static String expandFileName(String argument) {
if(argument.trim().equals("-"))
return "/dev/stdin";
return argument;
}
}

View File

@ -22,11 +22,12 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk;
package org.broadinstitute.sting.utils.text;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.ParsingEngine;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.testng.Assert;
@ -42,7 +43,7 @@ import java.util.List;
/**
* Tests selected functionality in the CommandLineExecutable class
*/
public class CommandLineExecutableUnitTest extends BaseTest {
public class ListFileUtilsUnitTest extends BaseTest {
@Test
public void testIgnoreBlankLinesInBAMListFiles() throws Exception {
@ -90,13 +91,10 @@ public class CommandLineExecutableUnitTest extends BaseTest {
List<String> bamFiles = new ArrayList<String>();
bamFiles.add(tempListFile.getAbsolutePath());
GATKArgumentCollection argCollection = new GATKArgumentCollection();
argCollection.samFiles = bamFiles;
CommandLineGATK testInstance = new CommandLineGATK();
testInstance.setParser(new ParsingEngine(testInstance));
List<SAMReaderID> unpackedBAMFileList = testInstance.unpackBAMFileList(argCollection);
List<SAMReaderID> unpackedBAMFileList = ListFileUtils.unpackBAMFileList(bamFiles,new ParsingEngine(testInstance));
Assert.assertEquals(unpackedBAMFileList.size(), expectedUnpackedFileList.size(),
"Unpacked BAM file list contains extraneous lines");