Fix bug in CombineGVCFs so that variants in a second sample occurring within another sample's deletions now get merged properly.
CombineGVCFs now outputs reference confidence for the duration of deletions, so that SNPs occurring in other samples at positions overlapping those deletions will be genotyped correctly.
This commit is contained in:
parent
c14fccb571
commit
c09667a20d
|
|
@ -112,10 +112,14 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
|
||||
protected final class PositionalState {
|
||||
final List<VariantContext> VCs;
|
||||
final Set<String> samples = new HashSet<>();
|
||||
final byte[] refBases;
|
||||
final GenomeLoc loc;
|
||||
public PositionalState(final List<VariantContext> VCs, final byte[] refBases, final GenomeLoc loc) {
|
||||
this.VCs = VCs;
|
||||
for(final VariantContext vc : VCs){
|
||||
samples.addAll(vc.getSampleNames());
|
||||
}
|
||||
this.refBases = refBases;
|
||||
this.loc = loc;
|
||||
}
|
||||
|
|
@ -123,6 +127,7 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
|
||||
protected final class OverallState {
|
||||
final LinkedList<VariantContext> VCs = new LinkedList<>();
|
||||
final Set<String> samples = new HashSet<>();
|
||||
GenomeLoc prevPos = null;
|
||||
byte refAfterPrevPos;
|
||||
|
||||
|
|
@ -179,13 +184,17 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
final int currentPos = startingStates.loc.getStart();
|
||||
|
||||
if ( !startingStates.VCs.isEmpty() ) {
|
||||
if ( ! okayToSkipThisSite(currentPos, previousState.prevPos) )
|
||||
endPreviousStates(previousState, currentPos - 1, startingStates.refBases[0]);
|
||||
if ( ! okayToSkipThisSite(startingStates, previousState) )
|
||||
endPreviousStates(previousState, currentPos - 1, startingStates, false);
|
||||
previousState.VCs.addAll(startingStates.VCs);
|
||||
for(final VariantContext vc : previousState.VCs){
|
||||
previousState.samples.addAll(vc.getSampleNames());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ( USE_BP_RESOLUTION || containsEndingContext(previousState.VCs, currentPos) ) {
|
||||
endPreviousStates(previousState, currentPos, startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N');
|
||||
endPreviousStates(previousState, currentPos, startingStates, true);
|
||||
}
|
||||
|
||||
return previousState;
|
||||
|
|
@ -194,12 +203,18 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
/**
|
||||
* Is it okay to skip the given position?
|
||||
*
|
||||
* @param thisPos this position
|
||||
* @param lastPosRun the last position for which we created a VariantContext
|
||||
* @param startingStates state information for this position
|
||||
* @param previousState state information for the last position for which we created a VariantContext
|
||||
* @return true if it is okay to skip this position, false otherwise
|
||||
*/
|
||||
private boolean okayToSkipThisSite(final int thisPos, final GenomeLoc lastPosRun) {
|
||||
return lastPosRun != null && thisPos == lastPosRun.getStart() + 1;
|
||||
private boolean okayToSkipThisSite(final PositionalState startingStates, final OverallState previousState) {
|
||||
final int thisPos = startingStates.loc.getStart();
|
||||
final GenomeLoc lastPosRun = previousState.prevPos;
|
||||
Set<String> intersection = new HashSet<String>(startingStates.samples);
|
||||
intersection.retainAll(previousState.samples);
|
||||
|
||||
//if there's a starting VC with a sample that's already in a current VC, don't skip this position
|
||||
return lastPosRun != null && thisPos == lastPosRun.getStart() + 1 && intersection.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -237,9 +252,15 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
*
|
||||
* @param state the state with list of VariantContexts
|
||||
* @param pos the target ending position
|
||||
* @param refBase the reference base to use at the position AFTER pos
|
||||
* @param startingStates the state for the starting VCs
|
||||
*/
|
||||
private void endPreviousStates(final OverallState state, final int pos, final byte refBase) {
|
||||
private void endPreviousStates(final OverallState state, final int pos, final PositionalState startingStates, boolean atCurrentPosition) {
|
||||
|
||||
final byte refBase;
|
||||
if (atCurrentPosition)
|
||||
refBase = startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N';
|
||||
else
|
||||
refBase = startingStates.refBases[0];
|
||||
|
||||
final List<VariantContext> stoppedVCs = new ArrayList<>(state.VCs.size());
|
||||
|
||||
|
|
@ -250,12 +271,24 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
stoppedVCs.add(vc);
|
||||
|
||||
// if it was ending anyways, then remove it from the future state
|
||||
if ( isEndingContext(vc, pos) )
|
||||
if ( vc.getEnd() == pos) {
|
||||
state.samples.removeAll(vc.getSampleNames());
|
||||
state.VCs.remove(i);
|
||||
continue; //don't try to remove twice
|
||||
}
|
||||
|
||||
//if ending vc is the same sample as a starting VC, then remove it from the future state
|
||||
if(startingStates.VCs.size() > 0 && !atCurrentPosition && startingStates.samples.containsAll(vc.getSampleNames())) {
|
||||
state.samples.removeAll(vc.getSampleNames());
|
||||
state.VCs.remove(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( !stoppedVCs.isEmpty() ) {
|
||||
//if we already output this state in BP resolution mode (reflected by state.prevPos) then skip output
|
||||
GenomeLoc lastWriteLoc = state.prevPos;
|
||||
final int lastWritePos = lastWriteLoc == null ? -1 : lastWriteLoc.getStart();
|
||||
if ( !stoppedVCs.isEmpty() && pos > lastWritePos) {
|
||||
final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos);
|
||||
|
||||
// we need the specialized merge if the site contains anything other than ref blocks
|
||||
|
|
|
|||
|
|
@ -315,7 +315,7 @@ public class ReferenceConfidenceVariantContextMerger {
|
|||
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
|
||||
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
|
||||
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
|
||||
final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart());
|
||||
int[] perSampleIndexesOfRelevantAlleles;
|
||||
|
||||
for ( final Genotype g : VC.getGenotypes() ) {
|
||||
final String name = g.getSampleName();
|
||||
|
|
@ -325,11 +325,12 @@ public class ReferenceConfidenceVariantContextMerger {
|
|||
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
|
||||
if (g.hasPL()) {
|
||||
// lazy initialization of the genotype index map by ploidy.
|
||||
perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart(), g);
|
||||
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
|
||||
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(indexesOfRelevantAlleles)
|
||||
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles)
|
||||
: genotypeIndexMapsByPloidy[ploidy];
|
||||
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
|
||||
final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null;
|
||||
final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
|
||||
genotypeBuilder.PL(PLs).AD(AD).noGQ();
|
||||
}
|
||||
mergedGenotypes.add(genotypeBuilder.make());
|
||||
|
|
@ -364,27 +365,55 @@ public class ReferenceConfidenceVariantContextMerger {
|
|||
*
|
||||
* @param remappedAlleles the list of alleles to evaluate
|
||||
* @param targetAlleles the target list of alleles
|
||||
* @param position position to use for error messages
|
||||
* @param position position to output error info
|
||||
* @param g genotype from which targetAlleles are derived
|
||||
* @return non-null array of ints representing indexes
|
||||
*/
|
||||
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles, final int position) {
|
||||
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles, final int position, final Genotype g) {
|
||||
|
||||
if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty");
|
||||
if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty");
|
||||
|
||||
if ( !remappedAlleles.contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) )
|
||||
throw new UserException("The list of input alleles must contain " + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records");
|
||||
final int indexOfGenericAlt = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
||||
|
||||
final int indexOfNonRef = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
||||
|
||||
//if the refs don't match then let the non-ref allele be the most likely of the alts
|
||||
//TODO: eventually it would be nice to be able to trim alleles for spanning events to see if they really do have the same ref
|
||||
final boolean refsMatch = targetAlleles.get(0).equals(remappedAlleles.get(0),false);
|
||||
final int indexOfBestAlt;
|
||||
if (!refsMatch && g.hasPL()) {
|
||||
final int[] PLs = g.getPL().clone();
|
||||
PLs[0] = Integer.MAX_VALUE; //don't pick 0/0
|
||||
final int indexOfBestAltPL = MathUtils.minElementIndex(PLs);
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(indexOfBestAltPL);
|
||||
indexOfBestAlt = pair.alleleIndex2;
|
||||
}
|
||||
else
|
||||
indexOfBestAlt = indexOfNonRef;
|
||||
|
||||
final int[] indexMapping = new int[targetAlleles.size()];
|
||||
|
||||
// the reference alleles always match up (even if they don't appear to)
|
||||
// the reference likelihoods should always map to each other (even if the alleles don't)
|
||||
indexMapping[0] = 0;
|
||||
|
||||
// create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
|
||||
for ( int i = 1; i < targetAlleles.size(); i++ ) {
|
||||
final int targetNonRef = targetAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
||||
final boolean targetHasNonRef = targetNonRef != -1;
|
||||
final int lastConcreteAlt = targetHasNonRef ? targetAlleles.size()-2 : targetAlleles.size()-1;
|
||||
for ( int i = 1; i <= lastConcreteAlt; i++ ) {
|
||||
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
|
||||
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt : indexOfRemappedAllele;
|
||||
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
|
||||
}
|
||||
if (targetHasNonRef) {
|
||||
if (refsMatch) {
|
||||
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(targetNonRef));
|
||||
indexMapping[targetNonRef] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
|
||||
}
|
||||
else {
|
||||
indexMapping[targetNonRef] = indexOfBestAlt;
|
||||
}
|
||||
}
|
||||
|
||||
return indexMapping;
|
||||
|
|
|
|||
|
|
@ -206,7 +206,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testBasepairResolutionInput() throws Exception {
|
||||
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6116f3c70cd5288f3e8b89b1953a1e15"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("eb56760419b295651b6d54ba3ad18f52"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testBasepairResolutionInput", spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -132,7 +132,8 @@ public class VariantContextMergerUnitTest extends BaseTest {
|
|||
alleles1.add(Allele.create("A", true));
|
||||
final List<Allele> alleles2 = new ArrayList<>(1);
|
||||
alleles2.add(Allele.create("A", true));
|
||||
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1);
|
||||
GenotypeBuilder builder = new GenotypeBuilder();
|
||||
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1, builder.make());
|
||||
Assert.fail("We should have thrown an exception because the <ALT> allele was not present");
|
||||
}
|
||||
|
||||
|
|
@ -147,7 +148,9 @@ public class VariantContextMergerUnitTest extends BaseTest {
|
|||
if ( allelesIndex > 0 )
|
||||
myAlleles.add(allAlleles.get(allelesIndex));
|
||||
|
||||
final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1);
|
||||
GenotypeBuilder builder = new GenotypeBuilder();
|
||||
|
||||
final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1, builder.make());
|
||||
|
||||
Assert.assertEquals(indexes.length, allAlleles.size());
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue