Merge pull request #763 from broadinstitute/ldg_combineGVCFsFix
Fix bug in CombineGVCFs so now sample 2 variants occurring within sample ...
This commit is contained in:
commit
31cb47b9e6
|
|
@ -112,10 +112,14 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
||||||
|
|
||||||
protected final class PositionalState {
|
protected final class PositionalState {
|
||||||
final List<VariantContext> VCs;
|
final List<VariantContext> VCs;
|
||||||
|
final Set<String> samples = new HashSet<>();
|
||||||
final byte[] refBases;
|
final byte[] refBases;
|
||||||
final GenomeLoc loc;
|
final GenomeLoc loc;
|
||||||
public PositionalState(final List<VariantContext> VCs, final byte[] refBases, final GenomeLoc loc) {
|
public PositionalState(final List<VariantContext> VCs, final byte[] refBases, final GenomeLoc loc) {
|
||||||
this.VCs = VCs;
|
this.VCs = VCs;
|
||||||
|
for(final VariantContext vc : VCs){
|
||||||
|
samples.addAll(vc.getSampleNames());
|
||||||
|
}
|
||||||
this.refBases = refBases;
|
this.refBases = refBases;
|
||||||
this.loc = loc;
|
this.loc = loc;
|
||||||
}
|
}
|
||||||
|
|
@ -123,6 +127,7 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
||||||
|
|
||||||
protected final class OverallState {
|
protected final class OverallState {
|
||||||
final LinkedList<VariantContext> VCs = new LinkedList<>();
|
final LinkedList<VariantContext> VCs = new LinkedList<>();
|
||||||
|
final Set<String> samples = new HashSet<>();
|
||||||
GenomeLoc prevPos = null;
|
GenomeLoc prevPos = null;
|
||||||
byte refAfterPrevPos;
|
byte refAfterPrevPos;
|
||||||
|
|
||||||
|
|
@ -179,13 +184,17 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
||||||
final int currentPos = startingStates.loc.getStart();
|
final int currentPos = startingStates.loc.getStart();
|
||||||
|
|
||||||
if ( !startingStates.VCs.isEmpty() ) {
|
if ( !startingStates.VCs.isEmpty() ) {
|
||||||
if ( ! okayToSkipThisSite(currentPos, previousState.prevPos) )
|
if ( ! okayToSkipThisSite(startingStates, previousState) )
|
||||||
endPreviousStates(previousState, currentPos - 1, startingStates.refBases[0]);
|
endPreviousStates(previousState, currentPos - 1, startingStates, false);
|
||||||
previousState.VCs.addAll(startingStates.VCs);
|
previousState.VCs.addAll(startingStates.VCs);
|
||||||
|
for(final VariantContext vc : previousState.VCs){
|
||||||
|
previousState.samples.addAll(vc.getSampleNames());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( USE_BP_RESOLUTION || containsEndingContext(previousState.VCs, currentPos) ) {
|
if ( USE_BP_RESOLUTION || containsEndingContext(previousState.VCs, currentPos) ) {
|
||||||
endPreviousStates(previousState, currentPos, startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N');
|
endPreviousStates(previousState, currentPos, startingStates, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
return previousState;
|
return previousState;
|
||||||
|
|
@ -194,12 +203,18 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
||||||
/**
|
/**
|
||||||
* Is it okay to skip the given position?
|
* Is it okay to skip the given position?
|
||||||
*
|
*
|
||||||
* @param thisPos this position
|
* @param startingStates state information for this position
|
||||||
* @param lastPosRun the last position for which we created a VariantContext
|
* @param previousState state information for the last position for which we created a VariantContext
|
||||||
* @return true if it is okay to skip this position, false otherwise
|
* @return true if it is okay to skip this position, false otherwise
|
||||||
*/
|
*/
|
||||||
private boolean okayToSkipThisSite(final int thisPos, final GenomeLoc lastPosRun) {
|
private boolean okayToSkipThisSite(final PositionalState startingStates, final OverallState previousState) {
|
||||||
return lastPosRun != null && thisPos == lastPosRun.getStart() + 1;
|
final int thisPos = startingStates.loc.getStart();
|
||||||
|
final GenomeLoc lastPosRun = previousState.prevPos;
|
||||||
|
Set<String> intersection = new HashSet<String>(startingStates.samples);
|
||||||
|
intersection.retainAll(previousState.samples);
|
||||||
|
|
||||||
|
//if there's a starting VC with a sample that's already in a current VC, don't skip this position
|
||||||
|
return lastPosRun != null && thisPos == lastPosRun.getStart() + 1 && intersection.isEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -237,9 +252,15 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
||||||
*
|
*
|
||||||
* @param state the state with list of VariantContexts
|
* @param state the state with list of VariantContexts
|
||||||
* @param pos the target ending position
|
* @param pos the target ending position
|
||||||
* @param refBase the reference base to use at the position AFTER pos
|
* @param startingStates the state for the starting VCs
|
||||||
*/
|
*/
|
||||||
private void endPreviousStates(final OverallState state, final int pos, final byte refBase) {
|
private void endPreviousStates(final OverallState state, final int pos, final PositionalState startingStates, boolean atCurrentPosition) {
|
||||||
|
|
||||||
|
final byte refBase;
|
||||||
|
if (atCurrentPosition)
|
||||||
|
refBase = startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N';
|
||||||
|
else
|
||||||
|
refBase = startingStates.refBases[0];
|
||||||
|
|
||||||
final List<VariantContext> stoppedVCs = new ArrayList<>(state.VCs.size());
|
final List<VariantContext> stoppedVCs = new ArrayList<>(state.VCs.size());
|
||||||
|
|
||||||
|
|
@ -250,12 +271,24 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
||||||
stoppedVCs.add(vc);
|
stoppedVCs.add(vc);
|
||||||
|
|
||||||
// if it was ending anyways, then remove it from the future state
|
// if it was ending anyways, then remove it from the future state
|
||||||
if ( isEndingContext(vc, pos) )
|
if ( vc.getEnd() == pos) {
|
||||||
|
state.samples.removeAll(vc.getSampleNames());
|
||||||
|
state.VCs.remove(i);
|
||||||
|
continue; //don't try to remove twice
|
||||||
|
}
|
||||||
|
|
||||||
|
//if ending vc is the same sample as a starting VC, then remove it from the future state
|
||||||
|
if(startingStates.VCs.size() > 0 && !atCurrentPosition && startingStates.samples.containsAll(vc.getSampleNames())) {
|
||||||
|
state.samples.removeAll(vc.getSampleNames());
|
||||||
state.VCs.remove(i);
|
state.VCs.remove(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if ( !stoppedVCs.isEmpty() ) {
|
//if we already output this state in BP resolution mode (reflected by state.prevPos) then skip output
|
||||||
|
GenomeLoc lastWriteLoc = state.prevPos;
|
||||||
|
final int lastWritePos = lastWriteLoc == null ? -1 : lastWriteLoc.getStart();
|
||||||
|
if ( !stoppedVCs.isEmpty() && pos > lastWritePos) {
|
||||||
final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos);
|
final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos);
|
||||||
|
|
||||||
// we need the specialized merge if the site contains anything other than ref blocks
|
// we need the specialized merge if the site contains anything other than ref blocks
|
||||||
|
|
|
||||||
|
|
@ -315,7 +315,7 @@ public class ReferenceConfidenceVariantContextMerger {
|
||||||
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
|
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
|
||||||
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
|
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
|
||||||
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
|
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
|
||||||
final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart());
|
int[] perSampleIndexesOfRelevantAlleles;
|
||||||
|
|
||||||
for ( final Genotype g : VC.getGenotypes() ) {
|
for ( final Genotype g : VC.getGenotypes() ) {
|
||||||
final String name = g.getSampleName();
|
final String name = g.getSampleName();
|
||||||
|
|
@ -325,11 +325,12 @@ public class ReferenceConfidenceVariantContextMerger {
|
||||||
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
|
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
|
||||||
if (g.hasPL()) {
|
if (g.hasPL()) {
|
||||||
// lazy initialization of the genotype index map by ploidy.
|
// lazy initialization of the genotype index map by ploidy.
|
||||||
|
perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart(), g);
|
||||||
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
|
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
|
||||||
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(indexesOfRelevantAlleles)
|
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles)
|
||||||
: genotypeIndexMapsByPloidy[ploidy];
|
: genotypeIndexMapsByPloidy[ploidy];
|
||||||
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
|
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
|
||||||
final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null;
|
final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
|
||||||
genotypeBuilder.PL(PLs).AD(AD).noGQ();
|
genotypeBuilder.PL(PLs).AD(AD).noGQ();
|
||||||
}
|
}
|
||||||
mergedGenotypes.add(genotypeBuilder.make());
|
mergedGenotypes.add(genotypeBuilder.make());
|
||||||
|
|
@ -364,27 +365,55 @@ public class ReferenceConfidenceVariantContextMerger {
|
||||||
*
|
*
|
||||||
* @param remappedAlleles the list of alleles to evaluate
|
* @param remappedAlleles the list of alleles to evaluate
|
||||||
* @param targetAlleles the target list of alleles
|
* @param targetAlleles the target list of alleles
|
||||||
* @param position position to use for error messages
|
* @param position position to output error info
|
||||||
|
* @param g genotype from which targetAlleles are derived
|
||||||
* @return non-null array of ints representing indexes
|
* @return non-null array of ints representing indexes
|
||||||
*/
|
*/
|
||||||
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles, final int position) {
|
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles, final int position, final Genotype g) {
|
||||||
|
|
||||||
if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty");
|
if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty");
|
||||||
if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty");
|
if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty");
|
||||||
|
|
||||||
if ( !remappedAlleles.contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) )
|
if ( !remappedAlleles.contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) )
|
||||||
throw new UserException("The list of input alleles must contain " + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records");
|
throw new UserException("The list of input alleles must contain " + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records");
|
||||||
final int indexOfGenericAlt = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
|
||||||
|
final int indexOfNonRef = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
||||||
|
|
||||||
|
//if the refs don't match then let the non-ref allele be the most likely of the alts
|
||||||
|
//TODO: eventually it would be nice to be able to trim alleles for spanning events to see if they really do have the same ref
|
||||||
|
final boolean refsMatch = targetAlleles.get(0).equals(remappedAlleles.get(0),false);
|
||||||
|
final int indexOfBestAlt;
|
||||||
|
if (!refsMatch && g.hasPL()) {
|
||||||
|
final int[] PLs = g.getPL().clone();
|
||||||
|
PLs[0] = Integer.MAX_VALUE; //don't pick 0/0
|
||||||
|
final int indexOfBestAltPL = MathUtils.minElementIndex(PLs);
|
||||||
|
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(indexOfBestAltPL);
|
||||||
|
indexOfBestAlt = pair.alleleIndex2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
indexOfBestAlt = indexOfNonRef;
|
||||||
|
|
||||||
final int[] indexMapping = new int[targetAlleles.size()];
|
final int[] indexMapping = new int[targetAlleles.size()];
|
||||||
|
|
||||||
// the reference alleles always match up (even if they don't appear to)
|
// the reference likelihoods should always map to each other (even if the alleles don't)
|
||||||
indexMapping[0] = 0;
|
indexMapping[0] = 0;
|
||||||
|
|
||||||
// create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
|
// create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
|
||||||
for ( int i = 1; i < targetAlleles.size(); i++ ) {
|
final int targetNonRef = targetAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
||||||
|
final boolean targetHasNonRef = targetNonRef != -1;
|
||||||
|
final int lastConcreteAlt = targetHasNonRef ? targetAlleles.size()-2 : targetAlleles.size()-1;
|
||||||
|
for ( int i = 1; i <= lastConcreteAlt; i++ ) {
|
||||||
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
|
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
|
||||||
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt : indexOfRemappedAllele;
|
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
|
||||||
|
}
|
||||||
|
if (targetHasNonRef) {
|
||||||
|
if (refsMatch) {
|
||||||
|
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(targetNonRef));
|
||||||
|
indexMapping[targetNonRef] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
indexMapping[targetNonRef] = indexOfBestAlt;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return indexMapping;
|
return indexMapping;
|
||||||
|
|
|
||||||
|
|
@ -206,7 +206,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testBasepairResolutionInput() throws Exception {
|
public void testBasepairResolutionInput() throws Exception {
|
||||||
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf";
|
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf";
|
||||||
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6116f3c70cd5288f3e8b89b1953a1e15"));
|
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("eb56760419b295651b6d54ba3ad18f52"));
|
||||||
spec.disableShadowBCF();
|
spec.disableShadowBCF();
|
||||||
executeTest("testBasepairResolutionInput", spec);
|
executeTest("testBasepairResolutionInput", spec);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -132,7 +132,8 @@ public class VariantContextMergerUnitTest extends BaseTest {
|
||||||
alleles1.add(Allele.create("A", true));
|
alleles1.add(Allele.create("A", true));
|
||||||
final List<Allele> alleles2 = new ArrayList<>(1);
|
final List<Allele> alleles2 = new ArrayList<>(1);
|
||||||
alleles2.add(Allele.create("A", true));
|
alleles2.add(Allele.create("A", true));
|
||||||
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1);
|
GenotypeBuilder builder = new GenotypeBuilder();
|
||||||
|
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1, builder.make());
|
||||||
Assert.fail("We should have thrown an exception because the <ALT> allele was not present");
|
Assert.fail("We should have thrown an exception because the <ALT> allele was not present");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -147,7 +148,9 @@ public class VariantContextMergerUnitTest extends BaseTest {
|
||||||
if ( allelesIndex > 0 )
|
if ( allelesIndex > 0 )
|
||||||
myAlleles.add(allAlleles.get(allelesIndex));
|
myAlleles.add(allAlleles.get(allelesIndex));
|
||||||
|
|
||||||
final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1);
|
GenotypeBuilder builder = new GenotypeBuilder();
|
||||||
|
|
||||||
|
final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1, builder.make());
|
||||||
|
|
||||||
Assert.assertEquals(indexes.length, allAlleles.size());
|
Assert.assertEquals(indexes.length, allAlleles.size());
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue