Merge pull request #763 from broadinstitute/ldg_combineGVCFsFix

Fix bug in CombineGVCFs so now sample 2 variants occurring within sample ...
This commit is contained in:
rpoplin 2014-11-06 12:56:15 -05:00
commit 31cb47b9e6
4 changed files with 88 additions and 23 deletions

View File

@ -112,10 +112,14 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
protected final class PositionalState {
final List<VariantContext> VCs;
final Set<String> samples = new HashSet<>();
final byte[] refBases;
final GenomeLoc loc;
public PositionalState(final List<VariantContext> VCs, final byte[] refBases, final GenomeLoc loc) {
this.VCs = VCs;
for(final VariantContext vc : VCs){
samples.addAll(vc.getSampleNames());
}
this.refBases = refBases;
this.loc = loc;
}
@ -123,6 +127,7 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
protected final class OverallState {
final LinkedList<VariantContext> VCs = new LinkedList<>();
final Set<String> samples = new HashSet<>();
GenomeLoc prevPos = null;
byte refAfterPrevPos;
@ -179,13 +184,17 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
final int currentPos = startingStates.loc.getStart();
if ( !startingStates.VCs.isEmpty() ) {
if ( ! okayToSkipThisSite(currentPos, previousState.prevPos) )
endPreviousStates(previousState, currentPos - 1, startingStates.refBases[0]);
if ( ! okayToSkipThisSite(startingStates, previousState) )
endPreviousStates(previousState, currentPos - 1, startingStates, false);
previousState.VCs.addAll(startingStates.VCs);
for(final VariantContext vc : previousState.VCs){
previousState.samples.addAll(vc.getSampleNames());
}
}
if ( USE_BP_RESOLUTION || containsEndingContext(previousState.VCs, currentPos) ) {
endPreviousStates(previousState, currentPos, startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N');
endPreviousStates(previousState, currentPos, startingStates, true);
}
return previousState;
@ -194,12 +203,18 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
/**
* Is it okay to skip the given position?
*
* @param thisPos this position
* @param lastPosRun the last position for which we created a VariantContext
* @param startingStates state information for this position
* @param previousState state information for the last position for which we created a VariantContext
* @return true if it is okay to skip this position, false otherwise
*/
private boolean okayToSkipThisSite(final int thisPos, final GenomeLoc lastPosRun) {
return lastPosRun != null && thisPos == lastPosRun.getStart() + 1;
private boolean okayToSkipThisSite(final PositionalState startingStates, final OverallState previousState) {
final int thisPos = startingStates.loc.getStart();
final GenomeLoc lastPosRun = previousState.prevPos;
Set<String> intersection = new HashSet<String>(startingStates.samples);
intersection.retainAll(previousState.samples);
//if there's a starting VC with a sample that's already in a current VC, don't skip this position
return lastPosRun != null && thisPos == lastPosRun.getStart() + 1 && intersection.isEmpty();
}
/**
@ -237,9 +252,15 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
*
* @param state the state with list of VariantContexts
* @param pos the target ending position
* @param refBase the reference base to use at the position AFTER pos
* @param startingStates the state for the starting VCs
*/
private void endPreviousStates(final OverallState state, final int pos, final byte refBase) {
private void endPreviousStates(final OverallState state, final int pos, final PositionalState startingStates, boolean atCurrentPosition) {
final byte refBase;
if (atCurrentPosition)
refBase = startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N';
else
refBase = startingStates.refBases[0];
final List<VariantContext> stoppedVCs = new ArrayList<>(state.VCs.size());
@ -250,12 +271,24 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
stoppedVCs.add(vc);
// if it was ending anyways, then remove it from the future state
if ( isEndingContext(vc, pos) )
if ( vc.getEnd() == pos) {
state.samples.removeAll(vc.getSampleNames());
state.VCs.remove(i);
continue; //don't try to remove twice
}
//if ending vc is the same sample as a starting VC, then remove it from the future state
if(startingStates.VCs.size() > 0 && !atCurrentPosition && startingStates.samples.containsAll(vc.getSampleNames())) {
state.samples.removeAll(vc.getSampleNames());
state.VCs.remove(i);
}
}
}
if ( !stoppedVCs.isEmpty() ) {
//if we already output this state in BP resolution mode (reflected by state.prevPos) then skip output
GenomeLoc lastWriteLoc = state.prevPos;
final int lastWritePos = lastWriteLoc == null ? -1 : lastWriteLoc.getStart();
if ( !stoppedVCs.isEmpty() && pos > lastWritePos) {
final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos);
// we need the specialized merge if the site contains anything other than ref blocks

View File

@ -315,7 +315,7 @@ public class ReferenceConfidenceVariantContextMerger {
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart());
int[] perSampleIndexesOfRelevantAlleles;
for ( final Genotype g : VC.getGenotypes() ) {
final String name = g.getSampleName();
@ -325,11 +325,12 @@ public class ReferenceConfidenceVariantContextMerger {
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
if (g.hasPL()) {
// lazy initialization of the genotype index map by ploidy.
perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart(), g);
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(indexesOfRelevantAlleles)
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles)
: genotypeIndexMapsByPloidy[ploidy];
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null;
final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
genotypeBuilder.PL(PLs).AD(AD).noGQ();
}
mergedGenotypes.add(genotypeBuilder.make());
@ -364,27 +365,55 @@ public class ReferenceConfidenceVariantContextMerger {
*
* @param remappedAlleles the list of alleles to evaluate
* @param targetAlleles the target list of alleles
* @param position position to use for error messages
* @param position position to output error info
* @param g genotype from which targetAlleles are derived
* @return non-null array of ints representing indexes
*/
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles, final int position) {
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles, final int position, final Genotype g) {
if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty");
if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty");
if ( !remappedAlleles.contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) )
throw new UserException("The list of input alleles must contain " + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records");
final int indexOfGenericAlt = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final int indexOfNonRef = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
//if the refs don't match then let the non-ref allele be the most likely of the alts
//TODO: eventually it would be nice to be able to trim alleles for spanning events to see if they really do have the same ref
final boolean refsMatch = targetAlleles.get(0).equals(remappedAlleles.get(0),false);
final int indexOfBestAlt;
if (!refsMatch && g.hasPL()) {
final int[] PLs = g.getPL().clone();
PLs[0] = Integer.MAX_VALUE; //don't pick 0/0
final int indexOfBestAltPL = MathUtils.minElementIndex(PLs);
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(indexOfBestAltPL);
indexOfBestAlt = pair.alleleIndex2;
}
else
indexOfBestAlt = indexOfNonRef;
final int[] indexMapping = new int[targetAlleles.size()];
// the reference alleles always match up (even if they don't appear to)
// the reference likelihoods should always map to each other (even if the alleles don't)
indexMapping[0] = 0;
// create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
for ( int i = 1; i < targetAlleles.size(); i++ ) {
final int targetNonRef = targetAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final boolean targetHasNonRef = targetNonRef != -1;
final int lastConcreteAlt = targetHasNonRef ? targetAlleles.size()-2 : targetAlleles.size()-1;
for ( int i = 1; i <= lastConcreteAlt; i++ ) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt : indexOfRemappedAllele;
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
if (targetHasNonRef) {
if (refsMatch) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(targetNonRef));
indexMapping[targetNonRef] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
else {
indexMapping[targetNonRef] = indexOfBestAlt;
}
}
return indexMapping;

View File

@ -206,7 +206,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBasepairResolutionInput() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6116f3c70cd5288f3e8b89b1953a1e15"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("eb56760419b295651b6d54ba3ad18f52"));
spec.disableShadowBCF();
executeTest("testBasepairResolutionInput", spec);
}

View File

@ -132,7 +132,8 @@ public class VariantContextMergerUnitTest extends BaseTest {
alleles1.add(Allele.create("A", true));
final List<Allele> alleles2 = new ArrayList<>(1);
alleles2.add(Allele.create("A", true));
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1);
GenotypeBuilder builder = new GenotypeBuilder();
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1, builder.make());
Assert.fail("We should have thrown an exception because the <ALT> allele was not present");
}
@ -147,7 +148,9 @@ public class VariantContextMergerUnitTest extends BaseTest {
if ( allelesIndex > 0 )
myAlleles.add(allAlleles.get(allelesIndex));
final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1);
GenotypeBuilder builder = new GenotypeBuilder();
final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1, builder.make());
Assert.assertEquals(indexes.length, allAlleles.size());