Merge pull request #1263 from broadinstitute/rhl_rbp_not_merging_snps

Merge consecutive SNPs on the same read
This commit is contained in:
Ron Levine 2016-01-04 15:37:04 -05:00
commit 37746e40f4
4 changed files with 136 additions and 74 deletions

View File

@ -363,7 +363,7 @@ class PhasingUtils {
*
* @param gt1 genotype 1
* @param gt2 genotype 2
* @return true if genotypes have the same number of chromosomes, haplotype, phased, number of attributes
* @return true if genotypes have the same number of chromosomes, haplotype, number of attributes
* as chromosomes, and either genotype is homozygous, false otherwise
*/
static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) {
@ -371,10 +371,6 @@ class PhasingUtils {
if (gt1.getPloidy() != gt2.getPloidy())
return false;
// If gt1 or gt2 are not phased, then can not be merged.
if (!gt1.isPhased() || !gt2.isPhased())
return false;
// If gt1 or gt2 are homozygous, then could be merged.
if (gt1.isHom() || gt2.isHom())
return true;

View File

@ -52,6 +52,7 @@
package org.broadinstitute.gatk.tools.walkers.phasing;
import org.broadinstitute.gatk.engine.walkers.*;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
import org.broadinstitute.gatk.utils.commandline.Hidden;
@ -63,6 +64,7 @@ import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.filters.MappingQualityZeroFilter;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.ReadUtils;
import org.broadinstitute.gatk.engine.GATKVCFUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
@ -338,6 +340,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
private List<VariantContext> processQueue(PhasingStats phaseStats, boolean processAll) {
List<VariantContext> oldPhasedList = new LinkedList<VariantContext>();
VariantAndReads prevVr = null;
while (!unphasedSiteQueue.isEmpty()) {
if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue
VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant;
@ -354,10 +357,35 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
oldPhasedList.addAll(discardIrrelevantPhasedSites());
if (DEBUG) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList));
VariantAndReads vr = unphasedSiteQueue.remove();
final VariantAndReads vr = unphasedSiteQueue.remove();
// should try to merge variants if they are SNPs that are within the minimum merging distance from each other
final boolean shouldTryToMerge = enableMergePhasedSegregatingPolymorphismsToMNP &&
prevVr != null &&
prevVr.variant.isSNP() && vr.variant.isSNP() &&
vr.variant.getStart() - prevVr.variant.getStart() <= maxGenomicDistanceForMNP;
// if should try to merge, find if there are any reads that contain both SNPs
boolean commonReads = false;
if ( shouldTryToMerge ) {
for ( final String readName : vr.variantReadNames) {
if (prevVr.variantReadNames.contains(readName)) {
commonReads = true;
break;
}
}
if ( DEBUG && !commonReads )
logger.debug("No common reads with previous variant for " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant));
}
if (DEBUG)
logger.debug("Performing phasing for " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant));
phaseSite(vr, phaseStats);
// phase the variant site, cannot phase if trying to merge variants that do not have any common reads
phaseSite(vr, phaseStats, !(shouldTryToMerge && !commonReads));
// save previous variant reads for next iteration
prevVr = vr;
}
// Update partiallyPhasedSites after phaseSite is done:
@ -399,7 +427,14 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue).
*/
private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) {
/**
* Phase the variant site relative to the previous site
*
* @param vr A variant and the reads for each sample at that site:
* @param phaseStats Summary statistics about phasing rates for each sample
* @param canPhase Can phase variant site relative to the previous site
*/
private void phaseSite(final VariantAndReads vr, final PhasingStats phaseStats, final boolean canPhase) {
VariantContext vc = vr.variant;
logger.debug("Will phase vc = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
@ -433,76 +468,78 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
boolean phasedCurGenotypeRelativeToPrevious = false;
for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) {
PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets);
if ( canPhase ) {
for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) {
PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets);
PhaseResult pr = phaseSampleAtSite(phaseWindow);
phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality);
PhaseResult pr = phaseSampleAtSite(phaseWindow);
phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality);
if (pr.phasingContainsInconsistencies) {
if (DEBUG)
logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
uvc.setPhasingInconsistent();
}
if (phasedCurGenotypeRelativeToPrevious) {
Genotype prevHetGenotype = phaseWindow.phaseRelativeToGenotype();
SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype);
if (!prevHetGenotype.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY))
throw new ReviewedGATKException("Internal error: missing haplotype markings for previous genotype, even though we put it there...");
String[] prevPairNames = (String[]) prevHetGenotype.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY);
String[] curPairNames = ensurePhasing(allelePair, prevAllelePair, prevPairNames, pr.haplotype);
Genotype phasedGt = new GenotypeBuilder(gt)
.alleles(allelePair.getAllelesAsList())
.attribute(VCFConstants.PHASE_QUALITY_KEY, pr.phaseQuality)
.attribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY, curPairNames)
.make();
uvc.setGenotype(samp, phasedGt);
if (DEBUG) {
logger.debug("PREVIOUS CHROMOSOME NAMES: Top= " + prevPairNames[0] + ", Bot= " + prevPairNames[1]);
logger.debug("PREVIOUS CHROMOSOMES:\n" + prevAllelePair + "\n");
logger.debug("CURRENT CHROMOSOME NAMES: Top= " + curPairNames[0] + ", Bot= " + curPairNames[1]);
logger.debug("CURRENT CHROMOSOMES:\n" + allelePair + "\n");
logger.debug("\n");
if (pr.phasingContainsInconsistencies) {
if (DEBUG)
logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
uvc.setPhasingInconsistent();
}
}
if (statsWriter != null) {
GenomeLoc prevLoc = null;
int curIndex = 0;
for (GenotypeAndReadBases grb : prevHetGenotypes) {
if (curIndex == prevHetGenotypes.size() - 1 - goBackFromEndOfPrevHets) {
prevLoc = grb.loc;
break;
if (phasedCurGenotypeRelativeToPrevious) {
Genotype prevHetGenotype = phaseWindow.phaseRelativeToGenotype();
SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype);
if (!prevHetGenotype.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY))
throw new ReviewedGATKException("Internal error: missing haplotype markings for previous genotype, even though we put it there...");
String[] prevPairNames = (String[]) prevHetGenotype.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY);
String[] curPairNames = ensurePhasing(allelePair, prevAllelePair, prevPairNames, pr.haplotype);
Genotype phasedGt = new GenotypeBuilder(gt)
.alleles(allelePair.getAllelesAsList())
.attribute(VCFConstants.PHASE_QUALITY_KEY, pr.phaseQuality)
.attribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY, curPairNames)
.make();
uvc.setGenotype(samp, phasedGt);
if (DEBUG) {
logger.debug("PREVIOUS CHROMOSOME NAMES: Top= " + prevPairNames[0] + ", Bot= " + prevPairNames[1]);
logger.debug("PREVIOUS CHROMOSOMES:\n" + prevAllelePair + "\n");
logger.debug("CURRENT CHROMOSOME NAMES: Top= " + curPairNames[0] + ", Bot= " + curPairNames[1]);
logger.debug("CURRENT CHROMOSOMES:\n" + allelePair + "\n");
logger.debug("\n");
}
++curIndex;
}
statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevLoc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
}
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
if (sampPhaseCounts == null) {
sampPhaseCounts = new PhaseCounts();
samplePhaseStats.put(samp, sampPhaseCounts);
}
sampPhaseCounts.numTestedSites++;
if (statsWriter != null) {
GenomeLoc prevLoc = null;
int curIndex = 0;
for (GenotypeAndReadBases grb : prevHetGenotypes) {
if (curIndex == prevHetGenotypes.size() - 1 - goBackFromEndOfPrevHets) {
prevLoc = grb.loc;
break;
}
++curIndex;
}
statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevLoc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
}
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
if (sampPhaseCounts == null) {
sampPhaseCounts = new PhaseCounts();
samplePhaseStats.put(samp, sampPhaseCounts);
}
sampPhaseCounts.numTestedSites++;
if (pr.phasingContainsInconsistencies) {
if (phasedCurGenotypeRelativeToPrevious)
sampPhaseCounts.numInconsistentSitesPhased++;
else
sampPhaseCounts.numInconsistentSitesNotPhased++;
}
if (pr.phasingContainsInconsistencies) {
if (phasedCurGenotypeRelativeToPrevious)
sampPhaseCounts.numInconsistentSitesPhased++;
else
sampPhaseCounts.numInconsistentSitesNotPhased++;
sampPhaseCounts.numPhased++;
// Phased current relative to *SOME* previous het genotype, so break out of loop:
if (phasedCurGenotypeRelativeToPrevious)
break;
}
if (phasedCurGenotypeRelativeToPrevious)
sampPhaseCounts.numPhased++;
// Phased current relative to *SOME* previous het genotype, so break out of loop:
if (phasedCurGenotypeRelativeToPrevious)
break;
}
if (!phasedCurGenotypeRelativeToPrevious) { // Either no previous hets, or unable to phase relative to any previous het:
@ -1206,6 +1243,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
private class VariantAndReads {
public VariantContext variant;
public HashMap<String, ReadBasesAtPosition> sampleReadBases;
public Set<String> variantReadNames;
public VariantAndReads(VariantContext variant, HashMap<String, ReadBasesAtPosition> sampleReadBases) {
this.variant = variant;
@ -1232,6 +1270,22 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
sampleReadBases.put(sample, readBases);
}
}
// if merging SNPs, save the read names overlapping the variant
if (enableMergePhasedSegregatingPolymorphismsToMNP && variant.isSNP()) {
variantReadNames = new HashSet<>();
for ( final GATKSAMRecord read : pileup.getReads() ) {
// get the SNP position in the read
Pair<Integer, Boolean> pair = ReadUtils.getReadCoordinateForReferenceCoordinate(read, variant.getStart());
// get the reads containing the SNP
for (final Allele altAllele : variant.getAlternateAlleles()) {
if (read.getReadBases()[pair.first] == altAllele.getBases()[0]) {
variantReadNames.add(read.getReadName());
}
}
}
}
}
}
}

View File

@ -81,7 +81,6 @@ public class PhasingUtilsUnitTest extends BaseTest {
private ReferenceSequenceFile referenceFile;
private Genotype genotype1;
private Genotype genotype2;
private Genotype genotype2unphased;
private String contig;
private List<Allele> alleleList1;
private List<Allele> alleleList2;
@ -94,9 +93,8 @@ public class PhasingUtilsUnitTest extends BaseTest {
genomeLocParser = new GenomeLocParser(referenceFile);
alleleList1 = Arrays.asList(Allele.create("T", true), Allele.create("C", false));
alleleList2 = Arrays.asList(Allele.create("G", true), Allele.create("A", false));
genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).phased(true).make();
genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(true).make();
genotype2unphased = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(false).make();
genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).make();
genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).make();
contig = new String("1");
vc1 = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype1).make();
vc2 = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype2).make();
@ -186,7 +184,6 @@ public class PhasingUtilsUnitTest extends BaseTest {
@Test
public void TestAlleleSegregationIsKnown(){
Assert.assertTrue(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2));
Assert.assertFalse(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2unphased));
}
@Test

View File

@ -152,7 +152,22 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
" -o %s" +
" --no_cmdline_in_header",
1,
Arrays.asList("d7797171d9ca4e173fab6b5af1e6d539"));
Arrays.asList("b251b4378fda9784f2175c7e3d80f032"));
executeTest("Do not merge unphased SNPs", spec);
}
@Test
public void testMergeSNPsIfSameRead() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ReadBackedPhasing" +
" -R " + b37KGReferenceWithDecoy +
" -I " + privateTestDir + "readBackedPhasing.bam" +
" --variant " + privateTestDir + "readBackedPhasing.vcf.gz" +
" -enableMergeToMNP -maxDistMNP 2 -L 1:1875000-1877000" +
" -o %s" +
" --no_cmdline_in_header",
1,
Arrays.asList("630816da701b9ea8674c23c91fa61bec"));
executeTest("Merge SNPs if on the same read", spec);
}
}