Adding option to CombineGVCFs to have it break blocks at every N sites.
Using --breakBandsAtMultiplesOf N will ensure that no reference blocks span across genomic positions that are multiples of N. This is especially important in the case of scatter-gather where you don't want your scatter intervals to start in the middle of blocks (because of a limitation in the way -L works in the GATK for VCF records with the END tag). For example, running with --breakBandsAtMultiplesOf 5 on this record: 1 69491 . G <NON_REF> . . END=69523 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 Will produce the following records: 1 69491 . G <NON_REF> . . END=69494 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 1 69495 . C <NON_REF> . . END=69499 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 1 69500 . T <NON_REF> . . END=69504 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 etc. Added docs and a new test.
This commit is contained in:
parent
e1862a04a8
commit
ea8a1edeb6
|
|
@ -148,6 +148,16 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
@Argument(fullName="convertToBasePairResolution", shortName="bpResolution", doc = "If specified, convert banded gVCFs to all-sites gVCFs", required=false)
|
||||
protected boolean USE_BP_RESOLUTION = false;
|
||||
|
||||
/**
|
||||
* To reduce file sizes our gVCFs group similar reference positions into bands. However, there are cases when users will want to know that no bands
|
||||
* span across a given genomic position (e.g. when scatter-gathering jobs across a compute farm). The option below enables users to break bands at
|
||||
* pre-defined positions. For example, a value of 10,000 would mean that we would ensure that no bands span across chr1:10000, chr1:20000, etc.
|
||||
*
|
||||
* Note that the --convertToBasePairResolution argument is just a special case of this argument with a value of 1.
|
||||
*/
|
||||
@Argument(fullName="breakBandsAtMultiplesOf", shortName="breakBandsAtMultiplesOf", doc = "If > 0, reference bands will be broken up at genomic positions that are multiples of this number", required=false)
|
||||
protected int multipleAtWhichToBreakBands = 0;
|
||||
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
public void initialize() {
|
||||
|
|
@ -164,6 +174,10 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
variants.addAll(variantCollection.getRodBindings());
|
||||
|
||||
genomeLocParser = getToolkit().getGenomeLocParser();
|
||||
|
||||
// optimization to prevent mods when we always just want to break bands
|
||||
if ( multipleAtWhichToBreakBands == 1 )
|
||||
USE_BP_RESOLUTION = true;
|
||||
}
|
||||
|
||||
public PositionalState map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
|
|
@ -192,13 +206,25 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
|
||||
}
|
||||
|
||||
if ( USE_BP_RESOLUTION || containsEndingContext(previousState.VCs, startingStates.loc.getStart()) ) {
|
||||
if ( breakBand(startingStates.loc) || containsEndingContext(previousState.VCs, startingStates.loc.getStart()) ) {
|
||||
endPreviousStates(previousState, startingStates.loc, startingStates, true);
|
||||
}
|
||||
|
||||
return previousState;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we break bands at the given position?
|
||||
*
|
||||
* @param loc the genomic location to evaluate against
|
||||
*
|
||||
* @return true if we should ensure that bands should be broken at the given position, false otherwise
|
||||
*/
|
||||
private boolean breakBand(final GenomeLoc loc) {
|
||||
return USE_BP_RESOLUTION ||
|
||||
(loc != null && multipleAtWhichToBreakBands > 0 && (loc.getStart()+1) % multipleAtWhichToBreakBands == 0); // add +1 to the loc because we want to break BEFORE this base
|
||||
}
|
||||
|
||||
/**
|
||||
* Is it okay to skip the given position?
|
||||
*
|
||||
|
|
|
|||
|
|
@ -203,6 +203,14 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
|
|||
executeTest("testBasepairResolutionOutput", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBreakBlocks() throws Exception {
|
||||
final String cmd = baseTestString(" -L 1:69485-69791 --breakBandsAtMultiplesOf 5");
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("3a8e53b8b590eaa2675149ceccb80a7a"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testBreakBlocks", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWrongReferenceBaseBugFix() throws Exception {
|
||||
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf"
|
||||
|
|
|
|||
Loading…
Reference in New Issue