Merge pull request #583 from broadinstitute/jt_tabix

Create Tabix indices for block-compressed VCFs
This commit is contained in:
jmthibault79 2014-03-31 16:17:25 -04:00
commit 8703bd7ad4
14 changed files with 107 additions and 24 deletions

View File

@ -29,6 +29,7 @@ import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory;
import java.io.File;
import java.io.OutputStream;
@ -48,15 +49,8 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
public static final String NO_HEADER_ARG_NAME = "no_cmdline_in_header";
public static final String SITES_ONLY_ARG_NAME = "sites_only";
public static final String FORCE_BCF = "bcf";
public static final HashSet<String> SUPPORTED_ZIPPED_SUFFIXES = new HashSet<String>();
//
// static list of zipped suffixes supported by this system.
//
static {
SUPPORTED_ZIPPED_SUFFIXES.add(".gz");
SUPPORTED_ZIPPED_SUFFIXES.add(".gzip");
}
public static final HashSet<String> SUPPORTED_ZIPPED_SUFFIXES =
new HashSet<>(Arrays.asList(VariantContextWriterFactory.BLOCK_COMPRESSED_EXTENSIONS));
/**
* The engine into which output stubs should be fed.

View File

@ -213,7 +213,7 @@ public class CatVariants extends CommandLineProgram {
FileOutputStream outputStream = new FileOutputStream(outputFile);
EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile);
final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary());
final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options);
boolean firstFile = true;

View File

@ -25,6 +25,8 @@
package org.broadinstitute.sting.utils.variant;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureCodecHeader;
@ -33,12 +35,15 @@ import org.broad.tribble.index.IndexCreator;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.index.interval.IntervalIndexCreator;
import org.broad.tribble.index.linear.LinearIndexCreator;
import org.broad.tribble.index.tabix.TabixFormat;
import org.broad.tribble.index.tabix.TabixIndexCreator;
import org.broad.tribble.readers.LineIterator;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.*;
@ -59,6 +64,7 @@ public class GATKVCFUtils {
*/
private GATKVCFUtils() { }
public static final Logger logger = Logger.getLogger(GATKVCFUtils.class);
public final static String GATK_COMMAND_LINE_KEY = "GATKCommandLine";
public final static GATKVCFIndexType DEFAULT_INDEX_TYPE = GATKVCFIndexType.DYNAMIC_SEEK; // by default, optimize for seek time. All indices prior to Nov 2013 used this type.
@ -192,6 +198,28 @@ public class GATKVCFUtils {
* @return
*/
public static IndexCreator getIndexCreator(final GATKVCFIndexType type, final int parameter, final File outFile) {
    // This overload has no sequence dictionary to offer, so the four-argument
    // variant is invoked with null in that position.
    final IndexCreator creator = getIndexCreator(type, parameter, outFile, null);
    return creator;
}
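/**
* Create and return an IndexCreator. When outFile is a compressed file, a Tabix index creator
* is returned and the requested index type and parameter are ignored.
* @param type the requested index type; ignored (with a warning if it is not the default) when outFile is compressed
* @param parameter the parameter for the requested index type; ignored (with a warning if it is not the default) when outFile is compressed
* @param outFile the output file to be indexed; its name decides whether it is treated as compressed
* @param sequenceDictionary the sequence dictionary handed to the Tabix index creator; may be null
* @return the IndexCreator to use for outFile
*/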
public static IndexCreator getIndexCreator(GATKVCFIndexType type, int parameter, File outFile, SAMSequenceDictionary sequenceDictionary) {
if (VCFWriterArgumentTypeDescriptor.isCompressed(outFile.toString())) {
if (type != GATKVCFUtils.DEFAULT_INDEX_TYPE || parameter != GATKVCFUtils.DEFAULT_INDEX_PARAMETER)
logger.warn("Creating Tabix index for " + outFile + ", ignoring user-specified index type and parameter");
if (sequenceDictionary == null)
return new TabixIndexCreator(TabixFormat.VCF);
else
return new TabixIndexCreator(sequenceDictionary, TabixFormat.VCF);
}
IndexCreator idxCreator;
switch (type) {
case DYNAMIC_SEEK: idxCreator = new DynamicIndexCreator(outFile, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); break;

View File

@ -29,6 +29,8 @@ import org.apache.commons.lang.StringUtils;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.index.tabix.TabixFormat;
import org.broad.tribble.util.TabixUtils;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -314,8 +316,9 @@ public class WalkerTest extends BaseTest {
String ext = spec.exts == null ? ".tmp" : "." + spec.exts.get(i);
File fl = createTempFile(String.format("walktest.tmp_param.%d", i), ext);
// Mark corresponding *.idx for deletion on exit as well just in case an index is created for the temp file:
new File(fl.getAbsolutePath() + ".idx").deleteOnExit();
// Mark corresponding indices for deletion on exit as well just in case an index is created for the temp file:
new File(fl.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION).deleteOnExit();
new File(fl.getAbsolutePath() + TabixUtils.STANDARD_INDEX_EXTENSION).deleteOnExit();
tmpFiles.add(fl);
}

View File

@ -25,13 +25,17 @@
package org.broadinstitute.sting.utils.variant;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.AbstractIndex;
import org.broad.tribble.index.ChrIndex;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.index.interval.IntervalTreeIndex;
import org.broad.tribble.index.linear.LinearIndex;
import org.broad.tribble.index.tabix.TabixIndex;
import org.broad.tribble.util.TabixUtils;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor;
import org.broadinstitute.variant.vcf.VCFCodec;
import org.testng.Assert;
import org.testng.TestException;
@ -216,7 +220,7 @@ public class VCFIntegrationTest extends WalkerTest {
spec.disableShadowBCF();
File outVCF = executeTest(name, spec).first.get(0);
File outIdx = new File(outVCF.getAbsolutePath() + ".idx");
File outIdx = new File(outVCF.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION);
final Index actualIndex = IndexFactory.loadIndex(outIdx.getAbsolutePath());
final Index expectedIndex = testSpec.getIndex(outVCF);
@ -272,4 +276,58 @@ public class VCFIntegrationTest extends WalkerTest {
LinkedHashMap<String, ChrIndex> chrIndices = (LinkedHashMap<String, ChrIndex>) f.get(index);
return chrIndices.get(chr);
}
//
//
// Block-Compressed Tabix Index Tests
//
//
private class BlockCompressedIndexCreatorTest extends TestDataProvider {
private final String extension;
private BlockCompressedIndexCreatorTest(String extension) {
super(BlockCompressedIndexCreatorTest.class);
this.extension = extension;
}
public String toString() {
return String.format("File extension %s", extension);
}
}
/**
 * Supplies one test spec per supported zipped suffix, each with a
 * ".vcf" + suffix output extension.
 */
@DataProvider(name = "BlockCompressedIndexDataProvider")
public Object[][] blockCompressedIndexCreatorData() {
    for (final String zippedSuffix : VCFWriterArgumentTypeDescriptor.SUPPORTED_ZIPPED_SUFFIXES) {
        final String vcfExtension = ".vcf" + zippedSuffix;
        // The instance is not kept here; presumably the TestDataProvider
        // constructor registers it for getTests() below -- TODO confirm.
        new BlockCompressedIndexCreatorTest(vcfExtension);
    }
    return TestDataProvider.getTests(BlockCompressedIndexCreatorTest.class);
}
/**
 * Runs SelectVariants with the output directed to a file carrying the spec's
 * extension, then checks that no Tribble index was written next to it and
 * that the index found at the Tabix index location loads as a TabixIndex.
 */
@Test(dataProvider = "BlockCompressedIndexDataProvider")
public void testBlockCompressedIndexCreation(BlockCompressedIndexCreatorTest testSpec) throws NoSuchFieldException, IllegalAccessException {
    final String walkerArgs = " -T SelectVariants"
            + " -R " + b37KGReference
            + " --no_cmdline_in_header"
            + " -L 20"
            + " -V " + b37_NA12878_OMNI;
    final String testName = "testBlockCompressedIndexCreation: " + testSpec.toString();

    final File compressedVcf = createTempFile("testBlockCompressedIndexCreation", testSpec.extension);
    final WalkerTestSpec walkerSpec = new WalkerTestSpec(walkerArgs, 1, Arrays.asList(""));
    walkerSpec.disableShadowBCF();
    walkerSpec.setOutputFileLocation(compressedVcf);
    executeTest(testName, walkerSpec);

    // Both candidate index files sit beside the output and differ only in extension.
    final String outputPath = compressedVcf.getAbsolutePath();

    final File tribbleIndexFile = new File(outputPath + Tribble.STANDARD_INDEX_EXTENSION);
    Assert.assertFalse(tribbleIndexFile.exists(), "testBlockCompressedIndexCreation: Want Tabix index but Tribble index exists: " + tribbleIndexFile);

    final File tabixIndexFile = new File(outputPath + TabixUtils.STANDARD_INDEX_EXTENSION);
    final Index loadedIndex = IndexFactory.loadIndex(tabixIndexFile.toString());
    Assert.assertTrue(loadedIndex instanceof TabixIndex, "testBlockCompressedIndexCreation: Want Tabix index but index is not Tabix: " + tabixIndexFile);
}
}

View File

@ -3,23 +3,23 @@
<modelVersion>4.0.0</modelVersion>
<groupId>net.sf</groupId>
<artifactId>picard</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
<name>picard</name>
<dependencies>
<dependency>
<groupId>net.sf</groupId>
<artifactId>sam</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
</dependency>
<dependency>
<groupId>org.broadinstitute</groupId>
<artifactId>variant</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
</dependency>
<dependency>
<groupId>org.broad</groupId>
<artifactId>tribble</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
</dependency>
<!-- TODO: Picard is using a custom zip with just ant's BZip2 classes. See also: http://www.kohsuke.org/bzip2 -->
<dependency>

View File

@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>net.sf</groupId>
<artifactId>sam</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
<name>sam-jdk</name>
<dependencies>
<dependency>

View File

@ -3,13 +3,13 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.broad</groupId>
<artifactId>tribble</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
<name>tribble</name>
<dependencies>
<dependency>
<groupId>net.sf</groupId>
<artifactId>sam</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
</dependency>
</dependencies>
</project>

View File

@ -3,18 +3,18 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.broadinstitute</groupId>
<artifactId>variant</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
<name>variant</name>
<dependencies>
<dependency>
<groupId>org.broad</groupId>
<artifactId>tribble</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
</dependency>
<dependency>
<groupId>net.sf</groupId>
<artifactId>sam</artifactId>
<version>1.110.1763</version>
<version>1.110.1773</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

View File

@ -43,7 +43,7 @@
<test.args>-Xmx${test.maxmemory} -XX:+UseParallelOldGC -XX:ParallelGCThreads=${java.gc.threads} -XX:GCTimeLimit=${java.gc.timeLimit} -XX:GCHeapFreeLimit=${java.gc.heapFreeLimit}</test.args>
<!-- Version numbers for picard sam-jdk. Usually kept in sync. -->
<picard.public.version>1.110.1763</picard.public.version>
<picard.public.version>1.110.1773</picard.public.version>
<sam.version>${picard.public.version}</sam.version>
<picard.version>${picard.public.version}</picard.version>
<variant.version>${picard.public.version}</variant.version>