From e75366f738fe070e80a081dea5a066948d007466 Mon Sep 17 00:00:00 2001 From: hanna Date: Mon, 28 Feb 2011 17:32:12 +0000 Subject: [PATCH] Fixed performance issue in protosharding code -- turns out that the index optimizer was mutating the data stored in the indices. Protosharding still disabled by default. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5334 348d0f76-0448-11de-a6fe-93d51630548a --- java/src/net/sf/samtools/GATKBAMFileSpan.java | 7 ++ .../reads/BAMIndexBinIterator.java | 2 - .../reads/LowMemoryIntervalSharder.java | 8 +- .../reads/utilities/PrintBAMRegion.java | 115 ++++++++++++++++++ 4 files changed, 128 insertions(+), 4 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java diff --git a/java/src/net/sf/samtools/GATKBAMFileSpan.java b/java/src/net/sf/samtools/GATKBAMFileSpan.java index 13c40c3a2..702248060 100644 --- a/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -68,4 +68,11 @@ public class GATKBAMFileSpan extends BAMFileSpan { gatkChunks.add(new GATKChunk(chunk)); return gatkChunks; } + + public String toString() { + StringBuilder builder = new StringBuilder(); + for(GATKChunk chunk: getGATKChunks()) + builder.append(String.format("%s;",chunk)); + return builder.toString(); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java index ef22f9b43..fbc7b8c3e 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java @@ -110,8 +110,6 @@ public class BAMIndexBinIterator { binPositionBuffer.putLong(position); binPositionBuffer.flip(); - System.out.printf("Writing bin number %d to position %d: coordinate = %d%n",indexBin,indexBin*Long.SIZE*8,position); - 
metaIndexChannel.write(binPositionBuffer); binPositionBuffer.flip(); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java index 6f0d52fd6..45ad7e081 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -182,12 +182,16 @@ public class LowMemoryIntervalSharder implements Iterator { for(GATKBin bin: binTree.getBins()) { if(bin == null) continue; - chunks.addAll(Arrays.asList(bin.getChunkList())); + // The optimizer below will mutate the chunk list. Make sure each element is a clone of the reference sequence. + for(GATKChunk chunk: bin.getChunkList()) + chunks.add(chunk.clone()); } // Optimize the chunk list with a linear index optimization chunks = index.optimizeChunkList(chunks,index.getLinearIndex(initialRegion.getContigIndex()).getMinimumOffset(initialRegion.getStart())); - return new GATKBAMFileSpan(chunks.toArray(new GATKChunk[chunks.size()])); + GATKBAMFileSpan fileSpan = new GATKBAMFileSpan(chunks.toArray(new GATKChunk[chunks.size()])); + + return fileSpan; } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java new file mode 100644 index 000000000..ed3160996 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell 
+ * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.utilities; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecordIterator; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 3:25:13 PM + * To change this template use File | Settings | File Templates. 
+ */
+/**
+ * Command-line utility that prints every read found in one chunk of a BAM file.
+ * The chunk is supplied as a pair of block:offset file pointers (start-end).
+ */
+public class PrintBAMRegion extends CommandLineProgram {
+    /** Chunk format: two block:offset pairs joined by a dash, every field a run of decimal digits. */
+    private static final Pattern REGION_PATTERN = Pattern.compile("(\\d+):(\\d+)-(\\d+):(\\d+)");
+
+    // A packed file pointer is (block << 16 | offset): the offset takes the low
+    // 16 bits, which leaves 48 bits of a long for the block address.
+    private static final long MIN_BLOCK_SIZE = 0;
+    private static final long MAX_BLOCK_SIZE = (1L << 48) - 1;
+    private static final int MIN_OFFSET_SIZE = 0;
+    private static final int MAX_OFFSET_SIZE = (1 << 16) - 1;
+
+    @Argument(fullName="input",shortName="I",doc="Input file to process",required=true)
+    private File input = null;
+
+    @Argument(fullName="region",shortName="R",doc="BAM region to process, in chunk format (mmmm:nn-xxxx:yy)",required=true)
+    private String region;
+
+    /**
+     * Prints each read in the requested chunk to standard output, one per line,
+     * followed by a count of the reads shown.
+     * @return 0 once the whole chunk has been printed.
+     * @throws UserException if the region is not in chunk format or a field is out of range.
+     */
+    public int execute() {
+        // Validate the region before touching the file, so that a malformed
+        // argument can never leave an open reader behind.
+        GATKChunk chunk = parseRegion(region);
+        GATKBAMFileSpan fileSpan = new GATKBAMFileSpan(chunk);
+
+        SAMFileReader reader = new SAMFileReader(input);
+        try {
+            reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
+
+            SAMRecordIterator iterator = reader.iterator(fileSpan);
+            try {
+                long readCount = 0;
+                while(iterator.hasNext()) {
+                    System.out.printf("%s%n",iterator.next().format());
+                    readCount++;
+                }
+                // Terminate the summary line so that it does not run into whatever is printed next.
+                System.out.printf("%d reads shown.%n",readCount);
+            }
+            finally {
+                iterator.close();
+            }
+        }
+        finally {
+            // Runs even when iteration fails part-way through the chunk.
+            reader.close();
+        }
+
+        return 0;
+    }
+
+    /**
+     * Converts a chunk-format region into a chunk of packed file pointers.
+     * @param regionText Region in the form block:offset-block:offset.
+     * @return Chunk spanning the first to the last file pointer.
+     * @throws UserException if the text is not in chunk format or a field is out of range.
+     */
+    private static GATKChunk parseRegion(String regionText) {
+        Matcher matcher = REGION_PATTERN.matcher(regionText);
+        if(!matcher.matches())
+            throw new UserException("BAM region to process must be in chunk format (mmmm:nn-xxxx:yy)");
+
+        // Both blocks are checked before either offset, so that the first
+        // problem reported for a given input stays the same as before.
+        long firstBlock = parseBounded("First block",matcher.group(1),MIN_BLOCK_SIZE,MAX_BLOCK_SIZE);
+        long lastBlock = parseBounded("Last block",matcher.group(3),MIN_BLOCK_SIZE,MAX_BLOCK_SIZE);
+        // The casts are safe: each offset has been range-checked against the 16-bit limit.
+        int firstOffset = (int)parseBounded("First offset",matcher.group(2),MIN_OFFSET_SIZE,MAX_OFFSET_SIZE);
+        int lastOffset = (int)parseBounded("Last offset",matcher.group(4),MIN_OFFSET_SIZE,MAX_OFFSET_SIZE);
+
+        return new GATKChunk(firstBlock<<16 | firstOffset,lastBlock<<16 | lastOffset);
+    }
+
+    /**
+     * Parses one numeric field of the region and checks it against its limits.
+     * @param label Name of the field, used to open the error message.
+     * @param digits Text of the field; decimal digits only.
+     * @param minimum Smallest acceptable value, inclusive.
+     * @param maximum Largest acceptable value, inclusive.
+     * @return The parsed value, guaranteed to lie within the limits.
+     * @throws UserException if the value is out of range or too large to represent.
+     */
+    private static long parseBounded(String label, String digits, long minimum, long maximum) {
+        long value;
+        try {
+            value = Long.parseLong(digits);
+        }
+        catch(NumberFormatException ex) {
+            // The pattern admits only decimal digits, so the one way to get here is a
+            // run of digits too long for a long; report it as an ordinary range error.
+            throw new UserException(String.format("%s is invalid; must be between %d and %d; actually is %s",label,minimum,maximum,digits));
+        }
+
+        if(value < minimum || value > maximum)
+            throw new UserException(String.format("%s is invalid; must be between %d and %d; actually is %d",label,minimum,maximum,value));
+        return value;
+    }
+
+    /**
+     * Required main method implementation.  Passes a new instance to start()
+     * (not defined in this file; presumably inherited from CommandLineProgram)
+     * and exits with status 0, or prints the stack trace and exits with status 1
+     * if an exception escapes.
+     * @param argv Command-line argument text.
+     * @throws Exception on error.
+     */
+    public static void main(String[] argv) throws Exception {
+        try {
+            PrintBAMRegion instance = new PrintBAMRegion();
+            start(instance, argv);
+            System.exit(0);
+        }
+        catch(Exception ex) {
+            ex.printStackTrace();
+            System.exit(1);
+        }
+    }
+}