Merge pull request #1263 from broadinstitute/rhl_rbp_not_merging_snps
Merge consecutive SNPs on the same read
This commit is contained in:
commit
37746e40f4
|
|
@ -363,7 +363,7 @@ class PhasingUtils {
|
|||
*
|
||||
* @param gt1 genotype 1
|
||||
* @param gt2 genotype 2
|
||||
* @return true if genotypes have the same number of chromosomes, haplotype, phased, number of attributes
|
||||
* @return true if genotypes have the same number of chromosomes, haplotype, number of attributes
|
||||
* as chromosomes, and either genotype is homozygous, false otherwise
|
||||
*/
|
||||
static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) {
|
||||
|
|
@ -371,10 +371,6 @@ class PhasingUtils {
|
|||
if (gt1.getPloidy() != gt2.getPloidy())
|
||||
return false;
|
||||
|
||||
// If gt1 or gt2 are not phased, then can not be merged.
|
||||
if (!gt1.isPhased() || !gt2.isPhased())
|
||||
return false;
|
||||
|
||||
// If gt1 or gt2 are homozygous, then could be merged.
|
||||
if (gt1.isHom() || gt2.isHom())
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@
|
|||
package org.broadinstitute.gatk.tools.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.gatk.engine.walkers.*;
|
||||
import org.broadinstitute.gatk.utils.collections.Pair;
|
||||
import org.broadinstitute.gatk.utils.commandline.Argument;
|
||||
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.gatk.utils.commandline.Hidden;
|
||||
|
|
@ -63,6 +64,7 @@ import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
|
|||
import org.broadinstitute.gatk.engine.filters.MappingQualityZeroFilter;
|
||||
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.gatk.utils.help.HelpConstants;
|
||||
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.gatk.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.gatk.engine.GATKVCFUtils;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
|
||||
|
|
@ -338,6 +340,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
private List<VariantContext> processQueue(PhasingStats phaseStats, boolean processAll) {
|
||||
List<VariantContext> oldPhasedList = new LinkedList<VariantContext>();
|
||||
|
||||
VariantAndReads prevVr = null;
|
||||
while (!unphasedSiteQueue.isEmpty()) {
|
||||
if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue
|
||||
VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant;
|
||||
|
|
@ -354,10 +357,35 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
oldPhasedList.addAll(discardIrrelevantPhasedSites());
|
||||
if (DEBUG) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList));
|
||||
|
||||
VariantAndReads vr = unphasedSiteQueue.remove();
|
||||
final VariantAndReads vr = unphasedSiteQueue.remove();
|
||||
|
||||
// should try to merge variants if they are SNPs that are within the minimum merging distance from each other
|
||||
final boolean shouldTryToMerge = enableMergePhasedSegregatingPolymorphismsToMNP &&
|
||||
prevVr != null &&
|
||||
prevVr.variant.isSNP() && vr.variant.isSNP() &&
|
||||
vr.variant.getStart() - prevVr.variant.getStart() <= maxGenomicDistanceForMNP;
|
||||
|
||||
// if should try to merge, find if there are any reads that contain both SNPs
|
||||
boolean commonReads = false;
|
||||
if ( shouldTryToMerge ) {
|
||||
for ( final String readName : vr.variantReadNames) {
|
||||
if (prevVr.variantReadNames.contains(readName)) {
|
||||
commonReads = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( DEBUG && !commonReads )
|
||||
logger.debug("No common reads with previous variant for " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant));
|
||||
}
|
||||
|
||||
if (DEBUG)
|
||||
logger.debug("Performing phasing for " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant));
|
||||
phaseSite(vr, phaseStats);
|
||||
|
||||
// phase the variant site, cannot phase if trying to merge variants that do not have any common reads
|
||||
phaseSite(vr, phaseStats, !(shouldTryToMerge && !commonReads));
|
||||
|
||||
// save previous variant reads for next iteration
|
||||
prevVr = vr;
|
||||
}
|
||||
|
||||
// Update partiallyPhasedSites after phaseSite is done:
|
||||
|
|
@ -399,7 +427,14 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue).
|
||||
*/
|
||||
|
||||
private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) {
|
||||
/**
|
||||
* Phase the variant site relative to the previous site
|
||||
*
|
||||
* @param vr A variant and the reads for each sample at that site:
|
||||
* @param phaseStats Summary statistics about phasing rates for each sample
|
||||
* @param canPhase Can phase variant site relative to the previous site
|
||||
*/
|
||||
private void phaseSite(final VariantAndReads vr, final PhasingStats phaseStats, final boolean canPhase) {
|
||||
VariantContext vc = vr.variant;
|
||||
logger.debug("Will phase vc = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
|
||||
|
||||
|
|
@ -433,76 +468,78 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
|
||||
|
||||
boolean phasedCurGenotypeRelativeToPrevious = false;
|
||||
for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) {
|
||||
PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets);
|
||||
if ( canPhase ) {
|
||||
for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) {
|
||||
PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets);
|
||||
|
||||
PhaseResult pr = phaseSampleAtSite(phaseWindow);
|
||||
phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality);
|
||||
PhaseResult pr = phaseSampleAtSite(phaseWindow);
|
||||
phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality);
|
||||
|
||||
if (pr.phasingContainsInconsistencies) {
|
||||
if (DEBUG)
|
||||
logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
|
||||
uvc.setPhasingInconsistent();
|
||||
}
|
||||
|
||||
if (phasedCurGenotypeRelativeToPrevious) {
|
||||
Genotype prevHetGenotype = phaseWindow.phaseRelativeToGenotype();
|
||||
SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype);
|
||||
if (!prevHetGenotype.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY))
|
||||
throw new ReviewedGATKException("Internal error: missing haplotype markings for previous genotype, even though we put it there...");
|
||||
String[] prevPairNames = (String[]) prevHetGenotype.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY);
|
||||
|
||||
String[] curPairNames = ensurePhasing(allelePair, prevAllelePair, prevPairNames, pr.haplotype);
|
||||
Genotype phasedGt = new GenotypeBuilder(gt)
|
||||
.alleles(allelePair.getAllelesAsList())
|
||||
.attribute(VCFConstants.PHASE_QUALITY_KEY, pr.phaseQuality)
|
||||
.attribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY, curPairNames)
|
||||
.make();
|
||||
uvc.setGenotype(samp, phasedGt);
|
||||
|
||||
if (DEBUG) {
|
||||
logger.debug("PREVIOUS CHROMOSOME NAMES: Top= " + prevPairNames[0] + ", Bot= " + prevPairNames[1]);
|
||||
logger.debug("PREVIOUS CHROMOSOMES:\n" + prevAllelePair + "\n");
|
||||
|
||||
logger.debug("CURRENT CHROMOSOME NAMES: Top= " + curPairNames[0] + ", Bot= " + curPairNames[1]);
|
||||
logger.debug("CURRENT CHROMOSOMES:\n" + allelePair + "\n");
|
||||
logger.debug("\n");
|
||||
if (pr.phasingContainsInconsistencies) {
|
||||
if (DEBUG)
|
||||
logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
|
||||
uvc.setPhasingInconsistent();
|
||||
}
|
||||
}
|
||||
|
||||
if (statsWriter != null) {
|
||||
GenomeLoc prevLoc = null;
|
||||
int curIndex = 0;
|
||||
for (GenotypeAndReadBases grb : prevHetGenotypes) {
|
||||
if (curIndex == prevHetGenotypes.size() - 1 - goBackFromEndOfPrevHets) {
|
||||
prevLoc = grb.loc;
|
||||
break;
|
||||
if (phasedCurGenotypeRelativeToPrevious) {
|
||||
Genotype prevHetGenotype = phaseWindow.phaseRelativeToGenotype();
|
||||
SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype);
|
||||
if (!prevHetGenotype.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY))
|
||||
throw new ReviewedGATKException("Internal error: missing haplotype markings for previous genotype, even though we put it there...");
|
||||
String[] prevPairNames = (String[]) prevHetGenotype.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY);
|
||||
|
||||
String[] curPairNames = ensurePhasing(allelePair, prevAllelePair, prevPairNames, pr.haplotype);
|
||||
Genotype phasedGt = new GenotypeBuilder(gt)
|
||||
.alleles(allelePair.getAllelesAsList())
|
||||
.attribute(VCFConstants.PHASE_QUALITY_KEY, pr.phaseQuality)
|
||||
.attribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY, curPairNames)
|
||||
.make();
|
||||
uvc.setGenotype(samp, phasedGt);
|
||||
|
||||
if (DEBUG) {
|
||||
logger.debug("PREVIOUS CHROMOSOME NAMES: Top= " + prevPairNames[0] + ", Bot= " + prevPairNames[1]);
|
||||
logger.debug("PREVIOUS CHROMOSOMES:\n" + prevAllelePair + "\n");
|
||||
|
||||
logger.debug("CURRENT CHROMOSOME NAMES: Top= " + curPairNames[0] + ", Bot= " + curPairNames[1]);
|
||||
logger.debug("CURRENT CHROMOSOMES:\n" + allelePair + "\n");
|
||||
logger.debug("\n");
|
||||
}
|
||||
++curIndex;
|
||||
}
|
||||
statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevLoc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
|
||||
}
|
||||
|
||||
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
|
||||
if (sampPhaseCounts == null) {
|
||||
sampPhaseCounts = new PhaseCounts();
|
||||
samplePhaseStats.put(samp, sampPhaseCounts);
|
||||
}
|
||||
sampPhaseCounts.numTestedSites++;
|
||||
if (statsWriter != null) {
|
||||
GenomeLoc prevLoc = null;
|
||||
int curIndex = 0;
|
||||
for (GenotypeAndReadBases grb : prevHetGenotypes) {
|
||||
if (curIndex == prevHetGenotypes.size() - 1 - goBackFromEndOfPrevHets) {
|
||||
prevLoc = grb.loc;
|
||||
break;
|
||||
}
|
||||
++curIndex;
|
||||
}
|
||||
statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevLoc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
|
||||
}
|
||||
|
||||
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
|
||||
if (sampPhaseCounts == null) {
|
||||
sampPhaseCounts = new PhaseCounts();
|
||||
samplePhaseStats.put(samp, sampPhaseCounts);
|
||||
}
|
||||
sampPhaseCounts.numTestedSites++;
|
||||
|
||||
if (pr.phasingContainsInconsistencies) {
|
||||
if (phasedCurGenotypeRelativeToPrevious)
|
||||
sampPhaseCounts.numInconsistentSitesPhased++;
|
||||
else
|
||||
sampPhaseCounts.numInconsistentSitesNotPhased++;
|
||||
}
|
||||
|
||||
if (pr.phasingContainsInconsistencies) {
|
||||
if (phasedCurGenotypeRelativeToPrevious)
|
||||
sampPhaseCounts.numInconsistentSitesPhased++;
|
||||
else
|
||||
sampPhaseCounts.numInconsistentSitesNotPhased++;
|
||||
sampPhaseCounts.numPhased++;
|
||||
|
||||
// Phased current relative to *SOME* previous het genotype, so break out of loop:
|
||||
if (phasedCurGenotypeRelativeToPrevious)
|
||||
break;
|
||||
}
|
||||
|
||||
if (phasedCurGenotypeRelativeToPrevious)
|
||||
sampPhaseCounts.numPhased++;
|
||||
|
||||
// Phased current relative to *SOME* previous het genotype, so break out of loop:
|
||||
if (phasedCurGenotypeRelativeToPrevious)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!phasedCurGenotypeRelativeToPrevious) { // Either no previous hets, or unable to phase relative to any previous het:
|
||||
|
|
@ -1206,6 +1243,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
private class VariantAndReads {
|
||||
public VariantContext variant;
|
||||
public HashMap<String, ReadBasesAtPosition> sampleReadBases;
|
||||
public Set<String> variantReadNames;
|
||||
|
||||
public VariantAndReads(VariantContext variant, HashMap<String, ReadBasesAtPosition> sampleReadBases) {
|
||||
this.variant = variant;
|
||||
|
|
@ -1232,6 +1270,22 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
sampleReadBases.put(sample, readBases);
|
||||
}
|
||||
}
|
||||
|
||||
// if merging SNPs, save the read names overlapping the variant
|
||||
if (enableMergePhasedSegregatingPolymorphismsToMNP && variant.isSNP()) {
|
||||
variantReadNames = new HashSet<>();
|
||||
for ( final GATKSAMRecord read : pileup.getReads() ) {
|
||||
// get the SNP position in the read
|
||||
Pair<Integer, Boolean> pair = ReadUtils.getReadCoordinateForReferenceCoordinate(read, variant.getStart());
|
||||
|
||||
// get the reads containing the SNP
|
||||
for (final Allele altAllele : variant.getAlternateAlleles()) {
|
||||
if (read.getReadBases()[pair.first] == altAllele.getBases()[0]) {
|
||||
variantReadNames.add(read.getReadName());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -81,7 +81,6 @@ public class PhasingUtilsUnitTest extends BaseTest {
|
|||
private ReferenceSequenceFile referenceFile;
|
||||
private Genotype genotype1;
|
||||
private Genotype genotype2;
|
||||
private Genotype genotype2unphased;
|
||||
private String contig;
|
||||
private List<Allele> alleleList1;
|
||||
private List<Allele> alleleList2;
|
||||
|
|
@ -94,9 +93,8 @@ public class PhasingUtilsUnitTest extends BaseTest {
|
|||
genomeLocParser = new GenomeLocParser(referenceFile);
|
||||
alleleList1 = Arrays.asList(Allele.create("T", true), Allele.create("C", false));
|
||||
alleleList2 = Arrays.asList(Allele.create("G", true), Allele.create("A", false));
|
||||
genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).phased(true).make();
|
||||
genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(true).make();
|
||||
genotype2unphased = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(false).make();
|
||||
genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).make();
|
||||
genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).make();
|
||||
contig = new String("1");
|
||||
vc1 = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype1).make();
|
||||
vc2 = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype2).make();
|
||||
|
|
@ -186,7 +184,6 @@ public class PhasingUtilsUnitTest extends BaseTest {
|
|||
@Test
|
||||
public void TestAlleleSegregationIsKnown(){
|
||||
Assert.assertTrue(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2));
|
||||
Assert.assertFalse(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2unphased));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
|
|
@ -152,7 +152,22 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("d7797171d9ca4e173fab6b5af1e6d539"));
|
||||
Arrays.asList("b251b4378fda9784f2175c7e3d80f032"));
|
||||
executeTest("Do not merge unphased SNPs", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMergeSNPsIfSameRead() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T ReadBackedPhasing" +
|
||||
" -R " + b37KGReferenceWithDecoy +
|
||||
" -I " + privateTestDir + "readBackedPhasing.bam" +
|
||||
" --variant " + privateTestDir + "readBackedPhasing.vcf.gz" +
|
||||
" -enableMergeToMNP -maxDistMNP 2 -L 1:1875000-1877000" +
|
||||
" -o %s" +
|
||||
" --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("630816da701b9ea8674c23c91fa61bec"));
|
||||
executeTest("Merge SNPs if on the same read", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue