Adding option to CombineGVCFs to have it break blocks at every N sites.

Using --breakBandsAtMultiplesOf N will ensure that no reference blocks span across
genomic positions that are multiples of N.  This is especially important in the
case of scatter-gather where you don't want your scatter intervals to start in the
middle of blocks (because of a limitation in the way -L works in the GATK for VCF
records with the END tag).

For example, running with --breakBandsAtMultiplesOf 5 on this record:
1       69491   .       G       <NON_REF>       .       .       END=69523       GT:DP:GQ:MIN_DP:MIN_GQ:PL       ./.:94:99:82:99:0,120,1800

Will produce the following records:
1       69491   .       G       <NON_REF>       .       .       END=69494       GT:DP:GQ:MIN_DP:MIN_GQ:PL       ./.:94:99:82:99:0,120,1800
1       69495   .       C       <NON_REF>       .       .       END=69499       GT:DP:GQ:MIN_DP:MIN_GQ:PL       ./.:94:99:82:99:0,120,1800
1       69500   .       T       <NON_REF>       .       .       END=69504       GT:DP:GQ:MIN_DP:MIN_GQ:PL       ./.:94:99:82:99:0,120,1800
etc.

Added docs and a new test.
This commit is contained in:
Eric Banks 2015-03-12 13:29:52 -04:00
parent e1862a04a8
commit ea8a1edeb6
2 changed files with 35 additions and 1 deletions

View File

@ -148,6 +148,16 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
@Argument(fullName="convertToBasePairResolution", shortName="bpResolution", doc = "If specified, convert banded gVCFs to all-sites gVCFs", required=false)
protected boolean USE_BP_RESOLUTION = false;
/**
* To reduce file sizes our gVCFs group similar reference positions into bands. However, there are cases when users will want to know that no bands
* span across a given genomic position (e.g. when scatter-gathering jobs across a compute farm). The option below enables users to break bands at
* pre-defined positions. For example, a value of 10,000 would mean that we would ensure that no bands span across chr1:10000, chr1:20000, etc.
*
* Note that the --convertToBasePairResolution argument is just a special case of this argument with a value of 1.
*/
@Argument(fullName="breakBandsAtMultiplesOf", shortName="breakBandsAtMultiplesOf", doc = "If > 0, reference bands will be broken up at genomic positions that are multiples of this number", required=false)
protected int multipleAtWhichToBreakBands = 0;
private GenomeLocParser genomeLocParser;
public void initialize() {
@ -164,6 +174,10 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
variants.addAll(variantCollection.getRodBindings());
genomeLocParser = getToolkit().getGenomeLocParser();
// optimization to prevent mods when we always just want to break bands
if ( multipleAtWhichToBreakBands == 1 )
USE_BP_RESOLUTION = true;
}
public PositionalState map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
@ -192,13 +206,25 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
}
if ( USE_BP_RESOLUTION || containsEndingContext(previousState.VCs, startingStates.loc.getStart()) ) {
if ( breakBand(startingStates.loc) || containsEndingContext(previousState.VCs, startingStates.loc.getStart()) ) {
endPreviousStates(previousState, startingStates.loc, startingStates, true);
}
return previousState;
}
/**
* Should we break bands at the given position?
*
* @param loc the genomic location to evaluate against
*
* @return true if we should ensure that bands should be broken at the given position, false otherwise
*/
private boolean breakBand(final GenomeLoc loc) {
return USE_BP_RESOLUTION ||
(loc != null && multipleAtWhichToBreakBands > 0 && (loc.getStart()+1) % multipleAtWhichToBreakBands == 0); // add +1 to the loc because we want to break BEFORE this base
}
/**
* Is it okay to skip the given position?
*

View File

@ -203,6 +203,14 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
executeTest("testBasepairResolutionOutput", spec);
}
@Test
public void testBreakBlocks() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791 --breakBandsAtMultiplesOf 5");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("3a8e53b8b590eaa2675149ceccb80a7a"));
spec.disableShadowBCF();
executeTest("testBreakBlocks", spec);
}
@Test
public void testWrongReferenceBaseBugFix() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf"