Merge pull request #890 from broadinstitute/rhl_writers_use_index_args

Automatically choose indexing strategy by file extension
This commit is contained in:
droazen 2015-03-27 15:00:58 -04:00
commit 1f81c033f5
9 changed files with 346 additions and 94 deletions

View File

@ -57,6 +57,7 @@ import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.GATKVCFUtils;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.engine.io.DirectOutputTracker;
@ -71,6 +72,7 @@ import org.broadinstitute.gatk.engine.filters.BadMateFilter;
import org.broadinstitute.gatk.utils.genotyper.*;
import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter;
import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.*;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
@ -709,10 +711,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
ReferenceConfidenceModel referenceConfidenceModel = null;
// as determined experimentally Nov-Dec 2013
public final static GATKVCFIndexType OPTIMAL_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR;
public final static int OPTIMAL_GVCF_INDEX_PARAMETER = 128000;
//---------------------------------------------------------------------------------------------------------------
//
// initialize
@ -893,10 +891,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently. Use the sample_name argument to run on a single sample out of a multi-sample BAM file.");
headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines());
if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
// a kluge to enforce the use of this indexing strategy
if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE ||
getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) {
throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER);
// A kluge to enforce the use of this indexing strategy - must set the gVCF indexing values if not a using a gVCF output file .
// An output gVCF file automatically sets the indexing values because it has the .g.vcf extension.
if (!GATKVCFUtils.usingGVCFIndexingArguments(getToolkit().getArguments().variant_index_type, getToolkit().getArguments().variant_index_parameter) && !isGVCF()) {
throw new UserException.GVCFIndexException(GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
}
try {
@ -1404,4 +1402,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
activeRegion.removeAll( readsToRemove );
}
/**
* Is writing to an output GVCF file?
*
* @return true if the VCF output file has a .g.vcf extension
*/
private boolean isGVCF() {
return ((VariantContextWriterStub) vcfWriter).getOutputFile().getName().endsWith("." + GATKVCFUtils.GVCF_EXT);
}
}

View File

@ -51,6 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import org.broadinstitute.gatk.engine.GATKVCFUtils;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType;
@ -151,7 +152,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test(dataProvider = "MyDataProvider")
public void testHCWithGVCF(String bam, ReferenceConfidenceMode mode, String intervals, String md5) {
final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
executeTest(name, spec);
@ -163,7 +164,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test(dataProvider = "MyDataProviderHaploid", enabled=true)
public void testHCWithGVCFHaploid(final String bam, final ReferenceConfidenceMode mode, final String intervals, final String md5) {
final String commandLine = String.format("-T HaplotypeCaller -ploidy 1 --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final String name = "testHCWithGVCFHaploid bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
executeTest(name, spec);
@ -175,7 +176,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test(dataProvider = "MyDataProviderTetraploid", enabled=true)
public void testHCWithGVCFTetraploid(final String bam, final ReferenceConfidenceMode mode, final String intervals, final String md5) {
final String commandLine = String.format("-T HaplotypeCaller -ploidy 4 --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final String name = "testHCWithGVCFTetraploid bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
executeTest(name, spec);
@ -187,7 +188,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test(dataProvider = "MyDataProviderManyploid", enabled=true)
public void testHCWithGVCFManyploid(final String bam, final ReferenceConfidenceMode mode, final String intervals, final String md5) {
final String commandLine = String.format("-T HaplotypeCaller -ploidy 33 --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final String name = "testHCWithGVCFManyploid bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
executeTest(name, spec);
@ -196,7 +197,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test
public void testERCRegionWithNoCalledHaplotypes() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(""));
spec.disableShadowBCF();
executeTest("testERCRegionWithNoCalledHaplotypes", spec);
@ -211,10 +212,22 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
executeTest("testMissingGVCFIndexingStrategyException", spec);
}
/**
* Test HaplotypeCaller to ensure it does not throw an exception when a .g.vcf output file is specified and the indexing arguments are omitted
*/
@Test()
public void testGVCFIndexNoThrow() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000000-17000100");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(GATKVCFUtils.GVCF_EXT), Arrays.asList(""));
spec.disableShadowBCF();
executeTest("testGVCFIndexNoThrow", spec);
}
@Test()
public void testWrongParameterGVCFIndexException() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER + 1);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER + 1);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class);
spec.disableShadowBCF();
executeTest("testMissingGVCFIndexingStrategyException", spec);
@ -224,11 +237,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testWrongTypeGVCFIndexException() {
// ensure non-optimal, if optimal changes
GATKVCFIndexType type = GATKVCFIndexType.DYNAMIC_SEEK;
if (HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK)
if (GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK)
type = GATKVCFIndexType.DYNAMIC_SIZE;
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class);
spec.disableShadowBCF();
executeTest("testMissingGVCFIndexingStrategyException", spec);
@ -240,7 +253,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test()
public void testWrongGVCFNonVariantRecordOrderBugFix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("ae90a1b5c40f19df38e8ac58c3844ed5"));
spec.disableShadowBCF();
executeTest("testMissingGVCFIndexingStrategyException", spec);
@ -257,7 +270,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test
public void testNoCallGVCFMissingPLsBugFix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("bf6325fbc06c80893f3170668cfbed06"));
spec.disableShadowBCF();
executeTest("testNoCallGVCFMissingPLsBugFix", spec);
@ -269,7 +282,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test(enabled=true)
public void testGeneralPloidyArrayIndexBug1Fix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 1 -maxAltAlleles 2 -isr INTERSECTION -L 1:23696115-23696189",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, GENERAL_PLOIDY_BUGFIX1_BAM, GENERAL_PLOIDY_BUGFIX1_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, GENERAL_PLOIDY_BUGFIX1_BAM, GENERAL_PLOIDY_BUGFIX1_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(""));
spec.disableShadowBCF();
executeTest(" testGeneralPloidyArrayIndexBug1Fix", spec);
@ -281,7 +294,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test(enabled=true)
public void testGeneralPloidyArrayIndexBug2Fix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 2 -maxAltAlleles 2 -A DepthPerSampleHC -A StrandBiasBySample -L 1:38052860-38052937",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, GENERAL_PLOIDY_BUGFIX2_BAM, GENERAL_PLOIDY_BUGFIX2_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, GENERAL_PLOIDY_BUGFIX2_BAM, GENERAL_PLOIDY_BUGFIX2_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(""));
spec.disableShadowBCF();
executeTest(" testGeneralPloidyArrayIndexBug2Fix", spec);

View File

@ -362,7 +362,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -dontTrimActiveRegions -ERC GVCF " +
"-likelihoodEngine GraphBased -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "graphbased_no_such_edge_bug.bam", privateTestDir + "graphbased_no_such_edge_bug.intervals.bed",
HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(""));
spec.disableShadowBCF();
executeTest("testGraphBasedNoSuchEdgeBugFix", spec);

View File

@ -26,13 +26,11 @@
package org.broadinstitute.gatk.engine;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.tribble.index.*;
import htsjdk.tribble.index.interval.IntervalTreeIndex;
import htsjdk.tribble.index.linear.LinearIndex;
import org.apache.log4j.Logger;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import htsjdk.tribble.FeatureCodecHeader;
import htsjdk.tribble.index.DynamicIndexCreator;
import htsjdk.tribble.index.IndexCreator;
import htsjdk.tribble.index.IndexFactory;
import htsjdk.tribble.index.interval.IntervalIndexCreator;
import htsjdk.tribble.index.linear.LinearIndexCreator;
import htsjdk.tribble.index.tabix.TabixFormat;
@ -47,9 +45,8 @@ import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.*;
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.*;
import java.lang.reflect.Field;
import java.util.*;
@ -68,6 +65,16 @@ public class GATKVCFUtils {
public final static GATKVCFIndexType DEFAULT_INDEX_TYPE = GATKVCFIndexType.DYNAMIC_SEEK; // by default, optimize for seek time. All indices prior to Nov 2013 used this type.
public final static Integer DEFAULT_INDEX_PARAMETER = -1; // the default DYNAMIC_SEEK does not use a parameter
// as determined experimentally Nov-Dec 2013
public final static GATKVCFIndexType DEFAULT_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR;
public final static Integer DEFAULT_GVCF_INDEX_PARAMETER = 128000;
// GVCF file extension
public final static String GVCF_EXT = "g.vcf";
// Message for using the deprecated --variant_index_type or --variant_index_parameter arguments.
public final static String DEPRECATED_INDEX_ARGS_MSG = "Naming your output file using the .g.vcf extension will automatically set the appropriate values " +
" for --variant_index_type and --variant_index_parameter";
/**
* Gets the appropriately formatted header for a VCF file describing this GATK run
@ -260,4 +267,124 @@ public class GATKVCFUtils {
codec.close(vcfSource);
}
}
/**
* Check if the two indices are equivalent
*
* @param thisIndex index
* @param otherIndex index
* @return true if indices are equivalent, false otherwise.
*/
public static boolean equivalentAbstractIndices(AbstractIndex thisIndex, AbstractIndex otherIndex){
return thisIndex.getVersion() == otherIndex.getVersion() &&
thisIndex.getIndexedFile().equals(otherIndex.getIndexedFile()) &&
thisIndex.getIndexedFileSize() == otherIndex.getIndexedFileSize() &&
thisIndex.getIndexedFileMD5().equals(otherIndex.getIndexedFileMD5()) &&
thisIndex.getFlags() == otherIndex.getFlags();
}
/**
* Check if the two indices are equivalent for a chromosome
*
* @param thisIndex index
* @param otherIndex index
* @param chr chromosome
* @return true if indices are equivalent, false otherwise.
* @throws NoSuchFieldException if index does not exist for a chromosome
* @throws IllegalAccessException if index does not exist for a chromosome
*/
public static boolean equivalentLinearIndices(LinearIndex thisIndex, LinearIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException {
htsjdk.tribble.index.linear.LinearIndex.ChrIndex thisChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(thisIndex, chr);
htsjdk.tribble.index.linear.LinearIndex.ChrIndex otherChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(otherIndex, chr);
return thisChr.getName().equals(otherChr.getName()) &&
//thisChr.getTotalSize() == otherChr.getTotalSize() && TODO: why does this differ?
thisChr.getNFeatures() == otherChr.getNFeatures() &&
thisChr.getNBlocks() == otherChr.getNBlocks();
}
/**
* Check if the two interval indices are equivalent for a chromosome
*
* @param thisIndex interval index
* @param otherIndex interval index
* @param chr chromosome
* @return true if indices are equivalent, false otherwise.
* @throws NoSuchFieldException if index does not exist for a chromosome
* @throws IllegalAccessException if index does not exist for a chromosome
*/
public static boolean equivalentIntervalIndices(IntervalTreeIndex thisIndex, IntervalTreeIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException {
htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex thisChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(thisIndex, chr);
htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex otherChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(otherIndex, chr);
// TODO: compare trees?
return thisChr.getName().equals(otherChr.getName());
}
/**
* Get index for a chromosome
*
* @param index index
* @param chr chromosome
* @return index for the chromosome
* @throws NoSuchFieldException if index does not exist for a chromosome
* @throws IllegalAccessException if index does not exist for a chromosome
*/
public static ChrIndex getChrIndex(AbstractIndex index, String chr) throws NoSuchFieldException, IllegalAccessException {
Field f = AbstractIndex.class.getDeclaredField("chrIndices");
f.setAccessible(true);
LinkedHashMap<String, ChrIndex> chrIndices = (LinkedHashMap<String, ChrIndex>) f.get(index);
return chrIndices.get(chr);
}
/**
* Make an IndexCreator
*
* @param variantIndexType variant indexing strategy
* @param variantIndexParameter variant indexing parameter
* @param outputFile output variant file
* @param sequenceDictionary collection of SAM sequence records
* @return IndexCreator
*/
public static IndexCreator makeIndexCreator(final GATKVCFIndexType variantIndexType, final int variantIndexParameter, final File outputFile, final SAMSequenceDictionary sequenceDictionary) {
/*
* If using the index arguments, log a warning.
* If the genotype file has the GCVF extension (.g.vcf), use the default GCVF indexing.
* Otherwise, use the default index type and parameter.
*/
GATKVCFIndexType indexType = DEFAULT_INDEX_TYPE;
int indexParameter = DEFAULT_INDEX_PARAMETER;
if (usingNonDefaultIndexingArguments(variantIndexType, variantIndexParameter)) {
indexType = variantIndexType;
indexParameter = variantIndexParameter;
logger.warn(DEPRECATED_INDEX_ARGS_MSG);
} else if (outputFile.getName().endsWith("." + GVCF_EXT)) {
indexType = DEFAULT_GVCF_INDEX_TYPE;
indexParameter = DEFAULT_GVCF_INDEX_PARAMETER;
}
return getIndexCreator(indexType, indexParameter, outputFile, sequenceDictionary);
}
/**
* Check if not using the default indexing arguments' values
*
* @param variantIndexType variant indexing strategy
* @param variantIndexParameter variant indexing parameter
* @return true if the index type or parameter are not the default values, false otherwise
*/
public static boolean usingNonDefaultIndexingArguments(final GATKVCFIndexType variantIndexType, final int variantIndexParameter) {
return variantIndexType != GATKVCFUtils.DEFAULT_INDEX_TYPE || variantIndexParameter != GATKVCFUtils.DEFAULT_INDEX_PARAMETER;
}
/**
* Check if using the GCVF indexing arguments' values
*
* @param variantIndexType variant indexing strategy
* @param variantIndexParameter variant indexing parameter
* @return true if the index type and parameter are the default GVCF values, false otherwise
*/
public static boolean usingGVCFIndexingArguments(final GATKVCFIndexType variantIndexType, final int variantIndexParameter) {
return variantIndexType == GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE && variantIndexParameter == GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER;
}
}

View File

@ -621,12 +621,16 @@ public class GATKArgumentCollection {
* INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter
* DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored)
* DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored)
*
* This argument is deprecated, using the output file ".g.vcf" extension will automatically set the appropriate value
*/
@Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="Type of IndexCreator to use for VCF/BCF indices",required=false)
@Advanced
public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE;
/**
* This is either the bin width or the number of features per bin, depending on the indexing strategy
*
* This argument is deprecated, using the output file ".g.vcf" extension will automatically set the appropriate value
*/
@Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="Parameter to pass to the VCF/BCF IndexCreator",required=false)
@Advanced

View File

@ -130,7 +130,9 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
this.engine = engine;
this.genotypeFile = genotypeFile;
this.genotypeStream = null;
this.indexCreator = GATKVCFUtils.getIndexCreator(engine.getArguments().variant_index_type, engine.getArguments().variant_index_parameter, genotypeFile);
this.indexCreator = GATKVCFUtils.makeIndexCreator(engine.getArguments().variant_index_type, engine.getArguments().variant_index_parameter,
genotypeFile, null);
this.argumentSources = argumentSources;
}

View File

@ -266,12 +266,11 @@ public class CatVariants extends CommandLineProgram {
reader.close();
priorityQueue.add(new Pair<>(firstPosition,file));
}
}
FileOutputStream outputStream = new FileOutputStream(outputFile);
EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary());
IndexCreator idxCreator = GATKVCFUtils.makeIndexCreator(variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary());
final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options);
boolean firstFile = true;

View File

@ -24,16 +24,21 @@
*/
package org.broadinstitute.gatk.tools;
import org.apache.commons.lang.StringUtils;
import htsjdk.tribble.index.Index;
import htsjdk.tribble.index.IndexFactory;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.tribble.AbstractFeatureReader;
import org.apache.commons.lang.StringUtils;
import org.broadinstitute.gatk.engine.GATKVCFUtils;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.MD5DB;
import org.broadinstitute.gatk.utils.MD5Mismatch;
import org.broadinstitute.gatk.utils.runtime.ProcessController;
import org.broadinstitute.gatk.utils.runtime.ProcessSettings;
import org.broadinstitute.gatk.utils.runtime.RuntimeUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType;
import org.testng.Assert;
import org.testng.TestException;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -142,4 +147,82 @@ public class CatVariantsIntegrationTest {
ProcessSettings ps = new ProcessSettings(cmdLine.split("\\s+"));
pc.execAndCheck(ps);
}
//
//
// IndexCreator tests
//
//
private class VCFIndexCreatorTest extends BaseTest.TestDataProvider {
private final GATKVCFIndexType type;
private final int parameter;
private VCFIndexCreatorTest(GATKVCFIndexType type, int parameter) {
super(VCFIndexCreatorTest.class);
this.type = type;
this.parameter = parameter;
}
public String toString() {
return String.format("Index Type %s, Index Parameter %s", type, parameter);
}
public Index getIndex(final File vcfFile) {
switch (type) {
case DYNAMIC_SEEK : return IndexFactory.createDynamicIndex(vcfFile, new VCFCodec(), IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
case DYNAMIC_SIZE : return IndexFactory.createDynamicIndex(vcfFile, new VCFCodec(), IndexFactory.IndexBalanceApproach.FOR_SIZE);
case LINEAR : return IndexFactory.createLinearIndex(vcfFile, new VCFCodec(), parameter);
case INTERVAL : return IndexFactory.createIntervalIndex(vcfFile, new VCFCodec(), parameter);
default : throw new TestException("Invalid index type");
}
}
}
@DataProvider(name = "IndexDataProvider")
public Object[][] indexCreatorData() {
new VCFIndexCreatorTest(GATKVCFIndexType.DYNAMIC_SEEK, 0);
new VCFIndexCreatorTest(GATKVCFIndexType.DYNAMIC_SIZE, 0);
new VCFIndexCreatorTest(GATKVCFIndexType.LINEAR, 100);
new VCFIndexCreatorTest(GATKVCFIndexType.LINEAR, 10000);
new VCFIndexCreatorTest(GATKVCFIndexType.INTERVAL, 20);
new VCFIndexCreatorTest(GATKVCFIndexType.INTERVAL, 2000);
return BaseTest.TestDataProvider.getTests(VCFIndexCreatorTest.class);
}
@Test(dataProvider = "IndexDataProvider")
public void testCatVariantsVCFIndexCreation(VCFIndexCreatorTest testSpec) throws IOException{
String cmdLine = String.format("java -cp %s %s -R %s -V %s -V %s --variant_index_type %s --variant_index_parameter %s -out %s",
StringUtils.join(RuntimeUtils.getAbsoluteClassPaths(), File.pathSeparatorChar),
CatVariants.class.getCanonicalName(),
BaseTest.b37KGReference,
CatVariantsVcf1,
CatVariantsVcf2,
testSpec.type,
testSpec.parameter,
BaseTest.createTempFile("CatVariantsVCFIndexCreationTest", ".vcf"));
ProcessController pc = ProcessController.getThreadLocal();
ProcessSettings ps = new ProcessSettings(cmdLine.split("\\s+"));
pc.execAndCheck(ps);
}
@Test()
public void testCatVariantsGVCFIndexCreation() throws IOException{
String cmdLine = String.format("java -cp %s %s -R %s -V %s -V %s -out %s",
StringUtils.join(RuntimeUtils.getAbsoluteClassPaths(), File.pathSeparatorChar),
CatVariants.class.getCanonicalName(),
BaseTest.b37KGReference,
CatVariantsVcf1,
CatVariantsVcf2,
BaseTest.createTempFile("CatVariantsGVCFIndexCreationTest", "." + GATKVCFUtils.GVCF_EXT));
ProcessController pc = ProcessController.getThreadLocal();
ProcessSettings ps = new ProcessSettings(cmdLine.split("\\s+"));
pc.execAndCheck(ps);
}
}

View File

@ -28,16 +28,17 @@ package org.broadinstitute.gatk.tools.walkers.variantutils;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.Tribble;
import htsjdk.tribble.index.AbstractIndex;
import htsjdk.tribble.index.ChrIndex;
import htsjdk.tribble.index.Index;
import htsjdk.tribble.index.IndexFactory;
import htsjdk.tribble.index.interval.IntervalTreeIndex;
import htsjdk.tribble.index.linear.LinearIndex;
import htsjdk.tribble.index.tabix.TabixIndex;
import htsjdk.tribble.util.TabixUtils;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import htsjdk.variant.vcf.VCFCodec;
import org.apache.commons.io.FileUtils;
import org.broadinstitute.gatk.engine.GATKVCFUtils;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType;
import org.testng.Assert;
import org.testng.TestException;
@ -45,9 +46,8 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.lang.reflect.Field;
import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
public class VCFIntegrationTest extends WalkerTest {
@ -205,77 +205,67 @@ public class VCFIntegrationTest extends WalkerTest {
}
@Test(dataProvider = "IndexDataProvider")
public void testVCFIndexCreation(VCFIndexCreatorTest testSpec) throws NoSuchFieldException, IllegalAccessException {
public void testVCFIndexCreation(VCFIndexCreatorTest testSpec) throws NoSuchFieldException, IllegalAccessException, IOException {
final String logFileName = new String("testVCFIndexCreation.log");
final String chromosome = "20";
final String commandLine = " -T SelectVariants" +
" -R " + b37KGReference +
" --no_cmdline_in_header" +
" -L 20" +
" -L " + chromosome +
" -V " + b37_NA12878_OMNI +
" --variant_index_type " + testSpec.type +
" --variant_index_parameter " + testSpec.parameter +
" -o %s ";
" -log " + logFileName +
" -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList(""));
spec.disableShadowBCF();
final String name = "testVCFIndexCreation: " + testSpec.toString();
// execute that test and check if the actual and expected indices are the same
executeTestAndCheckIndices(name, chromosome, testSpec, spec);
// check the log for the warning message
File file = new File(logFileName);
Assert.assertTrue(FileUtils.readFileToString(file).contains(GATKVCFUtils.DEPRECATED_INDEX_ARGS_MSG));
}
@Test
public void testVCFIndexCreationNoArgs() throws NoSuchFieldException, IllegalAccessException {
final String chromosome = "20";
final String commandLine = " -T SelectVariants" +
" -R " + b37KGReference +
" --no_cmdline_in_header" +
" -L " + chromosome +
" -V " + b37_NA12878_OMNI +
" -o %s";
final String name = "testVCFIndexCreationNoArgs";
VCFIndexCreatorTest testSpec = new VCFIndexCreatorTest(GATKVCFUtils.DEFAULT_INDEX_TYPE, GATKVCFUtils.DEFAULT_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList(""));
spec.disableShadowBCF();
File outVCF = executeTest(name, spec).first.get(0);
File outIdx = new File(outVCF.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION);
final Index actualIndex = IndexFactory.loadIndex(outIdx.getAbsolutePath());
final Index expectedIndex = testSpec.getIndex(outVCF);
if (testSpec.type.equals("LINEAR"))
Assert.assertTrue(actualIndex instanceof LinearIndex, "Index is not a LinearIndex");
else if (testSpec.type.equals("INTERVAL"))
Assert.assertTrue(actualIndex instanceof IntervalTreeIndex, "Index is not a IntervalTreeIndex");
// dynamic indices ultimately resolve to one of LinearIndex or IntervalTreeIndex
Assert.assertTrue(equivalentAbstractIndices((AbstractIndex)actualIndex, (AbstractIndex)expectedIndex), "Indices are not equivalent");
if (actualIndex instanceof LinearIndex && expectedIndex instanceof LinearIndex) {
Assert.assertTrue(equivalentLinearIndices((LinearIndex)actualIndex, (LinearIndex)expectedIndex, "20"), "Linear indices are not equivalent");
}
else if (actualIndex instanceof IntervalTreeIndex && expectedIndex instanceof IntervalTreeIndex) {
Assert.assertTrue(equivalentIntervalIndices((IntervalTreeIndex)actualIndex, (IntervalTreeIndex)expectedIndex, "20"), "Interval indices are not equivalent");
}
else {
Assert.fail("Indices are not of the same type");
}
// execute that test and check if the actual and expected indices are the same
executeTestAndCheckIndices(name, chromosome, testSpec, spec);
}
private static boolean equivalentAbstractIndices(AbstractIndex thisIndex, AbstractIndex otherIndex){
return thisIndex.getVersion() == otherIndex.getVersion() &&
thisIndex.getIndexedFile().equals(otherIndex.getIndexedFile()) &&
thisIndex.getIndexedFileSize() == otherIndex.getIndexedFileSize() &&
thisIndex.getIndexedFileMD5().equals(otherIndex.getIndexedFileMD5()) &&
thisIndex.getFlags() == otherIndex.getFlags();
}
@Test
public void testGVCFIndexCreation() throws NoSuchFieldException, IllegalAccessException {
private static boolean equivalentLinearIndices(LinearIndex thisIndex, LinearIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException {
htsjdk.tribble.index.linear.LinearIndex.ChrIndex thisChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(thisIndex, chr);
htsjdk.tribble.index.linear.LinearIndex.ChrIndex otherChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(otherIndex, chr);
final String chromosome = "20";
final String commandLine = " -T SelectVariants" +
" -R " + b37KGReference +
" --no_cmdline_in_header" +
" -L " + chromosome +
" -V " + b37_NA12878_OMNI +
" -o %s";
final String name = "testGVCFIndexCreation";
VCFIndexCreatorTest testSpec = new VCFIndexCreatorTest(GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine, Arrays.asList(GATKVCFUtils.GVCF_EXT), Arrays.asList(""));
spec.disableShadowBCF();
return thisChr.getName().equals(otherChr.getName()) &&
//thisChr.getTotalSize() == otherChr.getTotalSize() && TODO: why does this differ?
thisChr.getNFeatures() == otherChr.getNFeatures() &&
thisChr.getNBlocks() == otherChr.getNBlocks();
}
private static boolean equivalentIntervalIndices(IntervalTreeIndex thisIndex, IntervalTreeIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException {
htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex thisChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(thisIndex, chr);
htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex otherChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(otherIndex, chr);
// TODO: compare trees?
return thisChr.getName().equals(otherChr.getName());
}
private static ChrIndex getChrIndex(AbstractIndex index, String chr) throws NoSuchFieldException, IllegalAccessException {
Field f = AbstractIndex.class.getDeclaredField("chrIndices");
f.setAccessible(true);
LinkedHashMap<String, ChrIndex> chrIndices = (LinkedHashMap<String, ChrIndex>) f.get(index);
return chrIndices.get(chr);
// execute that test and check if the actual and expected indices are the same
executeTestAndCheckIndices(name, chromosome, testSpec, spec);
}
//
@ -375,4 +365,31 @@ public class VCFIntegrationTest extends WalkerTest {
executeTest(name, spec);
}
private void executeTestAndCheckIndices(final String name, final String chr, final VCFIndexCreatorTest testSpec, final WalkerTestSpec walkerTestSpec)
throws NoSuchFieldException, IllegalAccessException {
File outVCF = executeTest(name, walkerTestSpec).first.get(0);
File outIdx = new File(outVCF.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION);
final Index actualIndex = IndexFactory.loadIndex(outIdx.getAbsolutePath());
final Index expectedIndex = testSpec.getIndex(outVCF);
if (testSpec.type.equals("LINEAR"))
Assert.assertTrue(actualIndex instanceof LinearIndex, "Index is not a LinearIndex");
else if (testSpec.type.equals("INTERVAL"))
Assert.assertTrue(actualIndex instanceof IntervalTreeIndex, "Index is not a IntervalTreeIndex");
// dynamic indices ultimately resolve to one of LinearIndex or IntervalTreeIndex
Assert.assertTrue(GATKVCFUtils.equivalentAbstractIndices((AbstractIndex) actualIndex, (AbstractIndex) expectedIndex), "Indices are not equivalent");
if (actualIndex instanceof LinearIndex && expectedIndex instanceof LinearIndex) {
Assert.assertTrue(GATKVCFUtils.equivalentLinearIndices((LinearIndex) actualIndex, (LinearIndex) expectedIndex, chr), "Linear indices are not equivalent");
}
else if (actualIndex instanceof IntervalTreeIndex && expectedIndex instanceof IntervalTreeIndex) {
Assert.assertTrue(GATKVCFUtils.equivalentIntervalIndices((IntervalTreeIndex) actualIndex, (IntervalTreeIndex) expectedIndex, chr), "Interval indices are not equivalent");
}
else {
Assert.fail("Indices are not of the same type");
}
}
}