Merge pull request #763 from broadinstitute/ldg_combineGVCFsFix
Fix bug in CombineGVCFs so now sample 2 variants occurring within sample ...
This commit is contained in:
commit
31cb47b9e6
|
|
@ -112,10 +112,14 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
|
||||
protected final class PositionalState {
|
||||
final List<VariantContext> VCs;
|
||||
final Set<String> samples = new HashSet<>();
|
||||
final byte[] refBases;
|
||||
final GenomeLoc loc;
|
||||
public PositionalState(final List<VariantContext> VCs, final byte[] refBases, final GenomeLoc loc) {
|
||||
this.VCs = VCs;
|
||||
for(final VariantContext vc : VCs){
|
||||
samples.addAll(vc.getSampleNames());
|
||||
}
|
||||
this.refBases = refBases;
|
||||
this.loc = loc;
|
||||
}
|
||||
|
|
@ -123,6 +127,7 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
|
||||
protected final class OverallState {
|
||||
final LinkedList<VariantContext> VCs = new LinkedList<>();
|
||||
final Set<String> samples = new HashSet<>();
|
||||
GenomeLoc prevPos = null;
|
||||
byte refAfterPrevPos;
|
||||
|
||||
|
|
@ -179,13 +184,17 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
final int currentPos = startingStates.loc.getStart();
|
||||
|
||||
if ( !startingStates.VCs.isEmpty() ) {
|
||||
if ( ! okayToSkipThisSite(currentPos, previousState.prevPos) )
|
||||
endPreviousStates(previousState, currentPos - 1, startingStates.refBases[0]);
|
||||
if ( ! okayToSkipThisSite(startingStates, previousState) )
|
||||
endPreviousStates(previousState, currentPos - 1, startingStates, false);
|
||||
previousState.VCs.addAll(startingStates.VCs);
|
||||
for(final VariantContext vc : previousState.VCs){
|
||||
previousState.samples.addAll(vc.getSampleNames());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ( USE_BP_RESOLUTION || containsEndingContext(previousState.VCs, currentPos) ) {
|
||||
endPreviousStates(previousState, currentPos, startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N');
|
||||
endPreviousStates(previousState, currentPos, startingStates, true);
|
||||
}
|
||||
|
||||
return previousState;
|
||||
|
|
@ -194,12 +203,18 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
/**
|
||||
* Is it okay to skip the given position?
|
||||
*
|
||||
* @param thisPos this position
|
||||
* @param lastPosRun the last position for which we created a VariantContext
|
||||
* @param startingStates state information for this position
|
||||
* @param previousState state information for the last position for which we created a VariantContext
|
||||
* @return true if it is okay to skip this position, false otherwise
|
||||
*/
|
||||
private boolean okayToSkipThisSite(final int thisPos, final GenomeLoc lastPosRun) {
|
||||
return lastPosRun != null && thisPos == lastPosRun.getStart() + 1;
|
||||
private boolean okayToSkipThisSite(final PositionalState startingStates, final OverallState previousState) {
|
||||
final int thisPos = startingStates.loc.getStart();
|
||||
final GenomeLoc lastPosRun = previousState.prevPos;
|
||||
Set<String> intersection = new HashSet<String>(startingStates.samples);
|
||||
intersection.retainAll(previousState.samples);
|
||||
|
||||
//if there's a starting VC with a sample that's already in a current VC, don't skip this position
|
||||
return lastPosRun != null && thisPos == lastPosRun.getStart() + 1 && intersection.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -237,9 +252,15 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
*
|
||||
* @param state the state with list of VariantContexts
|
||||
* @param pos the target ending position
|
||||
* @param refBase the reference base to use at the position AFTER pos
|
||||
* @param startingStates the state for the starting VCs
|
||||
*/
|
||||
private void endPreviousStates(final OverallState state, final int pos, final byte refBase) {
|
||||
private void endPreviousStates(final OverallState state, final int pos, final PositionalState startingStates, boolean atCurrentPosition) {
|
||||
|
||||
final byte refBase;
|
||||
if (atCurrentPosition)
|
||||
refBase = startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N';
|
||||
else
|
||||
refBase = startingStates.refBases[0];
|
||||
|
||||
final List<VariantContext> stoppedVCs = new ArrayList<>(state.VCs.size());
|
||||
|
||||
|
|
@ -250,12 +271,24 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
stoppedVCs.add(vc);
|
||||
|
||||
// if it was ending anyways, then remove it from the future state
|
||||
if ( isEndingContext(vc, pos) )
|
||||
if ( vc.getEnd() == pos) {
|
||||
state.samples.removeAll(vc.getSampleNames());
|
||||
state.VCs.remove(i);
|
||||
continue; //don't try to remove twice
|
||||
}
|
||||
|
||||
//if ending vc is the same sample as a starting VC, then remove it from the future state
|
||||
if(startingStates.VCs.size() > 0 && !atCurrentPosition && startingStates.samples.containsAll(vc.getSampleNames())) {
|
||||
state.samples.removeAll(vc.getSampleNames());
|
||||
state.VCs.remove(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( !stoppedVCs.isEmpty() ) {
|
||||
//if we already output this state in BP resolution mode (reflected by state.prevPos) then skip output
|
||||
GenomeLoc lastWriteLoc = state.prevPos;
|
||||
final int lastWritePos = lastWriteLoc == null ? -1 : lastWriteLoc.getStart();
|
||||
if ( !stoppedVCs.isEmpty() && pos > lastWritePos) {
|
||||
final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos);
|
||||
|
||||
// we need the specialized merge if the site contains anything other than ref blocks
|
||||
|
|
|
|||
|
|
@ -315,7 +315,7 @@ public class ReferenceConfidenceVariantContextMerger {
|
|||
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
|
||||
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
|
||||
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
|
||||
final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart());
|
||||
int[] perSampleIndexesOfRelevantAlleles;
|
||||
|
||||
for ( final Genotype g : VC.getGenotypes() ) {
|
||||
final String name = g.getSampleName();
|
||||
|
|
@ -325,11 +325,12 @@ public class ReferenceConfidenceVariantContextMerger {
|
|||
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
|
||||
if (g.hasPL()) {
|
||||
// lazy initialization of the genotype index map by ploidy.
|
||||
perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart(), g);
|
||||
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
|
||||
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(indexesOfRelevantAlleles)
|
||||
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles)
|
||||
: genotypeIndexMapsByPloidy[ploidy];
|
||||
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
|
||||
final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null;
|
||||
final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
|
||||
genotypeBuilder.PL(PLs).AD(AD).noGQ();
|
||||
}
|
||||
mergedGenotypes.add(genotypeBuilder.make());
|
||||
|
|
@ -364,27 +365,55 @@ public class ReferenceConfidenceVariantContextMerger {
|
|||
*
|
||||
* @param remappedAlleles the list of alleles to evaluate
|
||||
* @param targetAlleles the target list of alleles
|
||||
* @param position position to use for error messages
|
||||
* @param position position to output error info
|
||||
* @param g genotype from which targetAlleles are derived
|
||||
* @return non-null array of ints representing indexes
|
||||
*/
|
||||
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles, final int position) {
|
||||
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles, final int position, final Genotype g) {
|
||||
|
||||
if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty");
|
||||
if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty");
|
||||
|
||||
if ( !remappedAlleles.contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) )
|
||||
throw new UserException("The list of input alleles must contain " + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records");
|
||||
final int indexOfGenericAlt = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
||||
|
||||
final int indexOfNonRef = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
||||
|
||||
//if the refs don't match then let the non-ref allele be the most likely of the alts
|
||||
//TODO: eventually it would be nice to be able to trim alleles for spanning events to see if they really do have the same ref
|
||||
final boolean refsMatch = targetAlleles.get(0).equals(remappedAlleles.get(0),false);
|
||||
final int indexOfBestAlt;
|
||||
if (!refsMatch && g.hasPL()) {
|
||||
final int[] PLs = g.getPL().clone();
|
||||
PLs[0] = Integer.MAX_VALUE; //don't pick 0/0
|
||||
final int indexOfBestAltPL = MathUtils.minElementIndex(PLs);
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(indexOfBestAltPL);
|
||||
indexOfBestAlt = pair.alleleIndex2;
|
||||
}
|
||||
else
|
||||
indexOfBestAlt = indexOfNonRef;
|
||||
|
||||
final int[] indexMapping = new int[targetAlleles.size()];
|
||||
|
||||
// the reference alleles always match up (even if they don't appear to)
|
||||
// the reference likelihoods should always map to each other (even if the alleles don't)
|
||||
indexMapping[0] = 0;
|
||||
|
||||
// create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
|
||||
for ( int i = 1; i < targetAlleles.size(); i++ ) {
|
||||
final int targetNonRef = targetAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
||||
final boolean targetHasNonRef = targetNonRef != -1;
|
||||
final int lastConcreteAlt = targetHasNonRef ? targetAlleles.size()-2 : targetAlleles.size()-1;
|
||||
for ( int i = 1; i <= lastConcreteAlt; i++ ) {
|
||||
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
|
||||
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt : indexOfRemappedAllele;
|
||||
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
|
||||
}
|
||||
if (targetHasNonRef) {
|
||||
if (refsMatch) {
|
||||
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(targetNonRef));
|
||||
indexMapping[targetNonRef] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
|
||||
}
|
||||
else {
|
||||
indexMapping[targetNonRef] = indexOfBestAlt;
|
||||
}
|
||||
}
|
||||
|
||||
return indexMapping;
|
||||
|
|
|
|||
|
|
@ -206,7 +206,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testBasepairResolutionInput() throws Exception {
|
||||
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6116f3c70cd5288f3e8b89b1953a1e15"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("eb56760419b295651b6d54ba3ad18f52"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testBasepairResolutionInput", spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -132,7 +132,8 @@ public class VariantContextMergerUnitTest extends BaseTest {
|
|||
alleles1.add(Allele.create("A", true));
|
||||
final List<Allele> alleles2 = new ArrayList<>(1);
|
||||
alleles2.add(Allele.create("A", true));
|
||||
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1);
|
||||
GenotypeBuilder builder = new GenotypeBuilder();
|
||||
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1, builder.make());
|
||||
Assert.fail("We should have thrown an exception because the <ALT> allele was not present");
|
||||
}
|
||||
|
||||
|
|
@ -147,7 +148,9 @@ public class VariantContextMergerUnitTest extends BaseTest {
|
|||
if ( allelesIndex > 0 )
|
||||
myAlleles.add(allAlleles.get(allelesIndex));
|
||||
|
||||
final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1);
|
||||
GenotypeBuilder builder = new GenotypeBuilder();
|
||||
|
||||
final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1, builder.make());
|
||||
|
||||
Assert.assertEquals(indexes.length, allAlleles.size());
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue