Updated MNP merging to merge VC records if any sample has a haplotype of ALT-ALT, since this could possibly change annotations. Note that, besides the "interesting" case of an ALT-ALT MNP in a pair of HET sites, this could even occur if two records are hom-var (irrespective of using phasing). Note also that this procedure may generate more than one ALT allele.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4577 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6d7ed5781c
commit
a7af1a164b
|
|
@ -761,45 +761,22 @@ public class VariantContextUtils {
|
||||||
if (!mergeIntoMNPvalidationCheck(vc1, vc2))
|
if (!mergeIntoMNPvalidationCheck(vc1, vc2))
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
Map<Allele, Allele> allele1ToAllele2 = calcMergeAllele1WithAllele2Map(vc1, vc2);
|
// Check that it's logically possible to merge the VCs, and that there's a point in doing so (e.g., annotations could be changed):
|
||||||
if (allele1ToAllele2 == null)
|
if (!allSamplesAreMergeable(vc1, vc2) || !someSampleHasDoubleNonReferenceAllele(vc1, vc2))
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
return reallyMergeIntoMNP(vc1, vc2, referenceFile, allele1ToAllele2);
|
return reallyMergeIntoMNP(vc1, vc2, referenceFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, Map<Allele, Allele> allele1ToAllele2) {
|
private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
|
||||||
int startInter = vc1.getEnd() + 1;
|
int startInter = vc1.getEnd() + 1;
|
||||||
int endInter = vc2.getStart() - 1;
|
int endInter = vc2.getStart() - 1;
|
||||||
byte[] intermediateBases = null;
|
byte[] intermediateBases = null;
|
||||||
int intermediateLength = 0;
|
|
||||||
if (startInter <= endInter) {
|
if (startInter <= endInter) {
|
||||||
intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases();
|
intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases();
|
||||||
StringUtil.toUpperCase(intermediateBases);
|
StringUtil.toUpperCase(intermediateBases);
|
||||||
intermediateLength = intermediateBases.length;
|
|
||||||
}
|
}
|
||||||
|
MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added
|
||||||
Map<Allele, Allele> allele1ToMergedAllele = new HashMap<Allele, Allele>();
|
|
||||||
for (Map.Entry<Allele, Allele> all1all2Entry : allele1ToAllele2.entrySet()) {
|
|
||||||
Allele all1 = all1all2Entry.getKey();
|
|
||||||
Allele all2 = all1all2Entry.getValue();
|
|
||||||
|
|
||||||
byte[] bases1 = all1.getBases();
|
|
||||||
byte[] bases2 = all2.getBases();
|
|
||||||
|
|
||||||
byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length];
|
|
||||||
System.arraycopy(bases1, 0, mergedBases, 0, bases1.length);
|
|
||||||
if (intermediateBases != null)
|
|
||||||
System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength);
|
|
||||||
System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length);
|
|
||||||
|
|
||||||
Allele mergedAllele = Allele.create(mergedBases, all1.equals(vc1.getReference()));
|
|
||||||
allele1ToMergedAllele.put(all1, mergedAllele);
|
|
||||||
}
|
|
||||||
|
|
||||||
Set<Allele> allMergedAlleles = new HashSet<Allele>();
|
|
||||||
Allele mergedRefAllele = allele1ToMergedAllele.get(vc1.getReference());
|
|
||||||
allMergedAlleles.add(mergedRefAllele); // ensure that the reference allele is added
|
|
||||||
|
|
||||||
Map<String, Genotype> mergedGenotypes = new HashMap<String, Genotype>();
|
Map<String, Genotype> mergedGenotypes = new HashMap<String, Genotype>();
|
||||||
for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) {
|
for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) {
|
||||||
|
|
@ -808,10 +785,15 @@ public class VariantContextUtils {
|
||||||
Genotype gt2 = vc2.getGenotype(sample);
|
Genotype gt2 = vc2.getGenotype(sample);
|
||||||
|
|
||||||
List<Allele> site1Alleles = gt1.getAlleles();
|
List<Allele> site1Alleles = gt1.getAlleles();
|
||||||
|
List<Allele> site2Alleles = gt2.getAlleles();
|
||||||
|
|
||||||
List<Allele> mergedAllelesForSample = new LinkedList<Allele>();
|
List<Allele> mergedAllelesForSample = new LinkedList<Allele>();
|
||||||
|
|
||||||
|
Iterator<Allele> all2It = site2Alleles.iterator();
|
||||||
for (Allele all1 : site1Alleles) {
|
for (Allele all1 : site1Alleles) {
|
||||||
Allele mergedAllele = allele1ToMergedAllele.get(all1);
|
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
|
||||||
allMergedAlleles.add(mergedAllele);
|
|
||||||
|
Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2);
|
||||||
mergedAllelesForSample.add(mergedAllele);
|
mergedAllelesForSample.add(mergedAllele);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -823,7 +805,7 @@ public class VariantContextUtils {
|
||||||
if (PQ != null)
|
if (PQ != null)
|
||||||
mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, PQ);
|
mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, PQ);
|
||||||
|
|
||||||
Genotype mergedGt = new Genotype(sample, mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, gt1.genotypesArePhased());
|
Genotype mergedGt = new Genotype(sample, mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, calcPhaseForMergedGenotypes(gt1, gt2));
|
||||||
mergedGenotypes.put(sample, mergedGt);
|
mergedGenotypes.put(sample, mergedGt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -834,7 +816,7 @@ public class VariantContextUtils {
|
||||||
if (mergedAttribs == null)
|
if (mergedAttribs == null)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
VariantContext mergedVc = new VariantContext(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), allMergedAlleles, mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs);
|
VariantContext mergedVc = new VariantContext(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles(), mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs);
|
||||||
|
|
||||||
/* Calculate VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY from scratch
|
/* Calculate VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY from scratch
|
||||||
[though technically they should already be consistent with each of vc1 and vc2's respective alleles]:
|
[though technically they should already be consistent with each of vc1 and vc2's respective alleles]:
|
||||||
|
|
@ -846,6 +828,71 @@ public class VariantContextUtils {
|
||||||
return mergedVc;
|
return mergedVc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class AlleleOneAndTwo {
|
||||||
|
private Allele all1;
|
||||||
|
private Allele all2;
|
||||||
|
|
||||||
|
public AlleleOneAndTwo(Allele all1, Allele all2) {
|
||||||
|
this.all1 = all1;
|
||||||
|
this.all2 = all2;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return all1.hashCode() + all2.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (!(other instanceof AlleleOneAndTwo))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
AlleleOneAndTwo otherAot = (AlleleOneAndTwo)other;
|
||||||
|
return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class MergedAllelesData {
|
||||||
|
private Map<AlleleOneAndTwo, Allele> mergedAlleles;
|
||||||
|
private byte[] intermediateBases;
|
||||||
|
private int intermediateLength;
|
||||||
|
|
||||||
|
public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) {
|
||||||
|
this.mergedAlleles = new HashMap<AlleleOneAndTwo, Allele>(); // implemented equals() and hashCode() for AlleleOneAndTwo
|
||||||
|
this.intermediateBases = intermediateBases;
|
||||||
|
this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0;
|
||||||
|
|
||||||
|
this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Allele ensureMergedAllele(Allele all1, Allele all2) {
|
||||||
|
return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor
|
||||||
|
}
|
||||||
|
|
||||||
|
private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) {
|
||||||
|
AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2);
|
||||||
|
Allele mergedAllele = mergedAlleles.get(all12);
|
||||||
|
|
||||||
|
if (mergedAllele == null) {
|
||||||
|
byte[] bases1 = all1.getBases();
|
||||||
|
byte[] bases2 = all2.getBases();
|
||||||
|
|
||||||
|
byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length];
|
||||||
|
System.arraycopy(bases1, 0, mergedBases, 0, bases1.length);
|
||||||
|
if (intermediateBases != null)
|
||||||
|
System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength);
|
||||||
|
System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length);
|
||||||
|
|
||||||
|
mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime);
|
||||||
|
mergedAlleles.put(all12, mergedAllele);
|
||||||
|
}
|
||||||
|
|
||||||
|
return mergedAllele;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<Allele> getAllMergedAlleles() {
|
||||||
|
return new HashSet<Allele>(mergedAlleles.values());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static String mergeVariantContextNames(String name1, String name2) {
|
private static String mergeVariantContextNames(String name1, String name2) {
|
||||||
return name1 + "_" + name2;
|
return name1 + "_" + name2;
|
||||||
}
|
}
|
||||||
|
|
@ -935,49 +982,76 @@ public class VariantContextUtils {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Map<Allele, Allele> calcMergeAllele1WithAllele2Map(VariantContext vc1, VariantContext vc2) {
|
// Assumes that vc1 and vc2 were already checked to have the same sample names:
|
||||||
// Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference):
|
private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) {
|
||||||
Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
|
// Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1:
|
||||||
Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>();
|
|
||||||
|
|
||||||
// Note the segregation of the alleles for the reference genome:
|
|
||||||
allele1ToAllele2.put(vc1.getReference(), vc2.getReference());
|
|
||||||
allele2ToAllele1.put(vc2.getReference(), vc1.getReference());
|
|
||||||
|
|
||||||
// Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples).
|
|
||||||
// [Also, check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1.]
|
|
||||||
for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) {
|
for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) {
|
||||||
String sample = gt1Entry.getKey();
|
String sample = gt1Entry.getKey();
|
||||||
Genotype gt1 = gt1Entry.getValue();
|
Genotype gt1 = gt1Entry.getValue();
|
||||||
Genotype gt2 = vc2.getGenotype(sample);
|
Genotype gt2 = vc2.getGenotype(sample);
|
||||||
|
|
||||||
if (gt1.getPloidy() != gt2.getPloidy())
|
if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom
|
||||||
return null;
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
if (!gt2.genotypesArePhased() && !gt2.isHom()) // since can merge if phased or even if an unphased hom
|
return true;
|
||||||
return null;
|
}
|
||||||
|
|
||||||
|
public static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) {
|
||||||
|
if (gt1.getPloidy() != gt2.getPloidy())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard].
|
||||||
|
|
||||||
|
HOWEVER, EVEN if this is not the case, but gt1.isHom(),
|
||||||
|
it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1.
|
||||||
|
*/
|
||||||
|
return (gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assumes that alleleSegregationIsKnown(gt1, gt2):
|
||||||
|
private static boolean calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) {
|
||||||
|
if (gt2.genotypesArePhased() || gt2.isHom())
|
||||||
|
return gt1.genotypesArePhased(); // maintain the phase of gt1
|
||||||
|
|
||||||
|
if (!gt1.isHom())
|
||||||
|
throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()");
|
||||||
|
|
||||||
|
/* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype
|
||||||
|
|
||||||
|
For example, if we're merging the third Genotype with the second one:
|
||||||
|
0/1
|
||||||
|
1|1
|
||||||
|
0/1
|
||||||
|
|
||||||
|
Then, we want to output:
|
||||||
|
0/1
|
||||||
|
1/2
|
||||||
|
*/
|
||||||
|
return false; // maintain the phase of gt2 [since !gt2.genotypesArePhased()]
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Checks if any sample has a MNP of ALT alleles (segregating together):
|
||||||
|
[Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)]
|
||||||
|
*/
|
||||||
|
private static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
|
||||||
|
for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) {
|
||||||
|
String sample = gt1Entry.getKey();
|
||||||
|
Genotype gt1 = gt1Entry.getValue();
|
||||||
|
Genotype gt2 = vc2.getGenotype(sample);
|
||||||
|
|
||||||
List<Allele> site1Alleles = gt1.getAlleles();
|
List<Allele> site1Alleles = gt1.getAlleles();
|
||||||
List<Allele> site2Alleles = gt2.getAlleles();
|
List<Allele> site2Alleles = gt2.getAlleles();
|
||||||
|
|
||||||
Iterator<Allele> all2It = site2Alleles.iterator();
|
Iterator<Allele> all2It = site2Alleles.iterator();
|
||||||
for (Allele all1 : site1Alleles) {
|
for (Allele all1 : site1Alleles) {
|
||||||
Allele all2 = all2It.next();
|
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
|
||||||
|
|
||||||
Allele all1To2 = allele1ToAllele2.get(all1);
|
if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate
|
||||||
if (all1To2 == null)
|
return true;
|
||||||
allele1ToAllele2.put(all1, all2);
|
|
||||||
else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2
|
|
||||||
return null;
|
|
||||||
|
|
||||||
Allele all2To1 = allele2ToAllele1.get(all2);
|
|
||||||
if (all2To1 == null)
|
|
||||||
allele2ToAllele1.put(all2, all1);
|
|
||||||
else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return allele1ToAllele2;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,414 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||||
|
|
||||||
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
import org.broad.tribble.vcf.VCFHeader;
|
||||||
|
import org.broad.tribble.vcf.VCFWriter;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
// Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts
|
||||||
|
|
||||||
|
public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||||
|
private VCFWriter innerWriter;
|
||||||
|
|
||||||
|
private ReferenceSequenceFile referenceFileForMNPmerging;
|
||||||
|
private int maxGenomicDistanceForMNP;
|
||||||
|
|
||||||
|
private VCFRecord vcfrWaitingToMerge;
|
||||||
|
private List<VCFRecord> filteredVcfrList;
|
||||||
|
|
||||||
|
private int numRecordsAttemptToMerge;
|
||||||
|
private int numRecordsWithinDistance;
|
||||||
|
private int numMergedRecords;
|
||||||
|
private AltAlleleStatsForSamples altAlleleStats = null;
|
||||||
|
|
||||||
|
private Logger logger;
|
||||||
|
|
||||||
|
// Should we call innerWriter.close() in close()
|
||||||
|
private boolean takeOwnershipOfInner;
|
||||||
|
|
||||||
|
public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, File referenceFile, int maxGenomicDistanceForMNP, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
|
||||||
|
this.innerWriter = innerWriter;
|
||||||
|
this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile);
|
||||||
|
this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP;
|
||||||
|
this.vcfrWaitingToMerge = null;
|
||||||
|
this.filteredVcfrList = new LinkedList<VCFRecord>();
|
||||||
|
this.numRecordsWithinDistance = 0;
|
||||||
|
this.numMergedRecords = 0;
|
||||||
|
|
||||||
|
if (trackAltAlleleStats)
|
||||||
|
this.altAlleleStats = new AltAlleleStatsForSamples(maxGenomicDistanceForMNP);
|
||||||
|
|
||||||
|
this.logger = logger;
|
||||||
|
this.takeOwnershipOfInner = takeOwnershipOfInner;
|
||||||
|
}
|
||||||
|
|
||||||
|
public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, File referenceFile, int maxGenomicDistanceForMNP, Logger logger) {
|
||||||
|
this(innerWriter, referenceFile, maxGenomicDistanceForMNP, logger, false, false); // by default, don't own inner and don't keep track of alt allele statistics
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeHeader(VCFHeader header) {
|
||||||
|
innerWriter.writeHeader(header);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() {
|
||||||
|
stopWaitingToMerge();
|
||||||
|
|
||||||
|
if (takeOwnershipOfInner)
|
||||||
|
innerWriter.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(VariantContext vc, byte refBase) {
|
||||||
|
logger.debug("Next VC input = " + VariantContextUtils.getLocation(vc));
|
||||||
|
boolean curVcIsNotFiltered = vc.isNotFiltered();
|
||||||
|
|
||||||
|
if (vcfrWaitingToMerge == null) {
|
||||||
|
logger.debug("NOT Waiting to merge...");
|
||||||
|
|
||||||
|
if (!filteredVcfrList.isEmpty())
|
||||||
|
throw new ReviewedStingException("filteredVcfrList should be empty if not waiting to merge a vc!");
|
||||||
|
|
||||||
|
if (curVcIsNotFiltered) { // still need to wait before can release vc
|
||||||
|
logger.debug("Waiting for new variant " + VariantContextUtils.getLocation(vc));
|
||||||
|
vcfrWaitingToMerge = new VCFRecord(vc, refBase);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
logger.debug("DIRECTLY output " + VariantContextUtils.getLocation(vc));
|
||||||
|
innerWriter.add(vc, refBase);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else { // waiting to merge vcfrWaitingToMerge
|
||||||
|
logger.debug("Waiting to merge " + VariantContextUtils.getLocation(vcfrWaitingToMerge.vc));
|
||||||
|
|
||||||
|
if (!curVcIsNotFiltered) {
|
||||||
|
logger.debug("Caching unprocessed output " + VariantContextUtils.getLocation(vc));
|
||||||
|
filteredVcfrList.add(new VCFRecord(vc, refBase));
|
||||||
|
}
|
||||||
|
else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them:
|
||||||
|
numRecordsAttemptToMerge++;
|
||||||
|
boolean mergeDistanceInRange = (minDistance(vcfrWaitingToMerge.vc, vc) <= maxGenomicDistanceForMNP);
|
||||||
|
|
||||||
|
/*
|
||||||
|
TODO: -- CONSIDER THE FOLLOWING EXAMPLE: WHAT DO WE WANT HERE??? --
|
||||||
|
If the following 3 genotypes originally exist for a sample [at sites 1, 2, and 3]:
|
||||||
|
1/1
|
||||||
|
0|1
|
||||||
|
0|1
|
||||||
|
|
||||||
|
Then, after merging the first two, we have [at sites 1 and 3]:
|
||||||
|
1/2
|
||||||
|
0|1
|
||||||
|
|
||||||
|
Then, not having merged would consider sites 2 and 3 as a MNP (since it's a diploid het site with haplotypes: REF-REF and ALT-ALT)
|
||||||
|
But, since we merged sites 1 and 2, we get that sites 1-2 and 3 are counted as two haplotypes of: ALT-REF and ALT-ALT
|
||||||
|
*/
|
||||||
|
if (altAlleleStats != null)
|
||||||
|
altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, mergeDistanceInRange);
|
||||||
|
|
||||||
|
boolean mergedRecords = false;
|
||||||
|
if (mergeDistanceInRange) {
|
||||||
|
numRecordsWithinDistance++;
|
||||||
|
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
|
||||||
|
if (mergedVc != null) {
|
||||||
|
mergedRecords = true;
|
||||||
|
vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase);
|
||||||
|
numMergedRecords++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!mergedRecords) {
|
||||||
|
stopWaitingToMerge();
|
||||||
|
vcfrWaitingToMerge = new VCFRecord(vc, refBase);
|
||||||
|
}
|
||||||
|
logger.debug("Merged? = " + mergedRecords);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void stopWaitingToMerge() {
|
||||||
|
if (vcfrWaitingToMerge == null) {
|
||||||
|
if (!filteredVcfrList.isEmpty())
|
||||||
|
throw new ReviewedStingException("filteredVcfrList should be empty if not waiting to merge a vc!");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
innerWriter.add(vcfrWaitingToMerge.vc, vcfrWaitingToMerge.refBase);
|
||||||
|
vcfrWaitingToMerge = null;
|
||||||
|
|
||||||
|
for (VCFRecord vcfr : filteredVcfrList)
|
||||||
|
innerWriter.add(vcfr.vc, vcfr.refBase);
|
||||||
|
filteredVcfrList.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getNumRecordsAttemptToMerge() {
|
||||||
|
return numRecordsAttemptToMerge;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getNumRecordsWithinDistance() {
|
||||||
|
return numRecordsWithinDistance;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getNumMergedRecords() {
|
||||||
|
return numMergedRecords;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int minDistance(VariantContext vc1, VariantContext vc2) {
|
||||||
|
return VariantContextUtils.getLocation(vc1).minDistance(VariantContextUtils.getLocation(vc2));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a string representation of this object.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return getClass().getName();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getAltAlleleStats() {
|
||||||
|
if (altAlleleStats == null)
|
||||||
|
return "";
|
||||||
|
|
||||||
|
return "\n" + altAlleleStats.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class VCFRecord {
|
||||||
|
public VariantContext vc;
|
||||||
|
public byte refBase;
|
||||||
|
|
||||||
|
public VCFRecord(VariantContext vc, byte refBase) {
|
||||||
|
this.vc = vc;
|
||||||
|
this.refBase = refBase;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class AltAlleleStats {
|
||||||
|
public int numSuccessiveGenotypes;
|
||||||
|
public int numSuccessiveGenotypesWithinDistance;
|
||||||
|
|
||||||
|
public int oneSampleMissing;
|
||||||
|
public int atLeastOneSampleNotCalledOrFiltered;
|
||||||
|
public int segregationUnknown;
|
||||||
|
public int eitherNotVariant;
|
||||||
|
|
||||||
|
public int bothInPairHaveVariant;
|
||||||
|
|
||||||
|
public int ref_ref_pair;
|
||||||
|
public int ref_alt_pair;
|
||||||
|
public int alt_ref_pair;
|
||||||
|
public int alt_alt_pair;
|
||||||
|
|
||||||
|
public int MNPsites;
|
||||||
|
public int CHetSites;
|
||||||
|
|
||||||
|
public AltAlleleStats() {
|
||||||
|
this.numSuccessiveGenotypes = 0;
|
||||||
|
this.numSuccessiveGenotypesWithinDistance = 0;
|
||||||
|
|
||||||
|
this.oneSampleMissing = 0;
|
||||||
|
this.atLeastOneSampleNotCalledOrFiltered = 0;
|
||||||
|
this.segregationUnknown = 0;
|
||||||
|
this.eitherNotVariant = 0;
|
||||||
|
|
||||||
|
this.bothInPairHaveVariant = 0;
|
||||||
|
|
||||||
|
this.ref_ref_pair = 0;
|
||||||
|
this.ref_alt_pair = 0;
|
||||||
|
this.alt_ref_pair = 0;
|
||||||
|
this.alt_alt_pair = 0;
|
||||||
|
|
||||||
|
this.MNPsites = 0;
|
||||||
|
this.CHetSites = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
|
sb.append("Sample missing:\t" + oneSampleMissing + "\n");
|
||||||
|
sb.append("Not called or filtered:\t" + atLeastOneSampleNotCalledOrFiltered + "\n");
|
||||||
|
|
||||||
|
sb.append("* Number of successive pairs of genotypes:\t" + numSuccessiveGenotypes + "\n");
|
||||||
|
sb.append("Number of successive pairs of genotypes within distance:\t" + numSuccessiveGenotypesWithinDistance + "\n");
|
||||||
|
|
||||||
|
sb.append("Unknown segregation, within distance:\t" + segregationUnknown + "\n");
|
||||||
|
sb.append("Not variant at least one of pair, segregation known, within distance:\t" + eitherNotVariant + "\n");
|
||||||
|
sb.append("* Variant at both, segregation known, within distance:\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n");
|
||||||
|
|
||||||
|
sb.append("[Total haplotypes at pairs:\t" + (ref_ref_pair + ref_alt_pair + alt_ref_pair + alt_alt_pair) + "\n");
|
||||||
|
sb.append("REF-REF:\t" + ref_ref_pair + "\n");
|
||||||
|
sb.append("REF-ALT:\t" + ref_alt_pair + "\n");
|
||||||
|
sb.append("ALT-REF:\t" + alt_ref_pair + "\n");
|
||||||
|
sb.append("ALT-ALT:\t" + alt_alt_pair + "]\n");
|
||||||
|
|
||||||
|
int hetAfterHetSites = MNPsites + CHetSites;
|
||||||
|
sb.append("* Het-Het sites (with REF allele present at each):\t" + percentageString(hetAfterHetSites, numSuccessiveGenotypesWithinDistance) + "\n");
|
||||||
|
sb.append("* MNPs:\t" + percentageString(MNPsites, hetAfterHetSites) + "\n");
|
||||||
|
sb.append("Compound Hets:\t" + CHetSites + "\n");
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String percentageString(int count, int baseCount) {
|
||||||
|
int NUM_DECIMAL_PLACES = 1;
|
||||||
|
String percent = new Formatter().format("%."+ NUM_DECIMAL_PLACES + "f", MathUtils.percentage(count, baseCount)).toString();
|
||||||
|
return count + " (" + percent + "%)";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class AltAlleleStatsForSamples {
|
||||||
|
private Map<String, AltAlleleStats> sampleStats;
|
||||||
|
private int distance;
|
||||||
|
|
||||||
|
public AltAlleleStatsForSamples(int distance) {
|
||||||
|
this.sampleStats = new HashMap<String, AltAlleleStats>();
|
||||||
|
this.distance = distance;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean mergeDistanceInRange) {
|
||||||
|
if (vc1.isFiltered() || vc2.isFiltered())
|
||||||
|
return;
|
||||||
|
|
||||||
|
Set<String> allSamples = new TreeSet<String>(vc1.getSampleNames());
|
||||||
|
allSamples.addAll(vc2.getSampleNames());
|
||||||
|
|
||||||
|
for (String samp : allSamples) {
|
||||||
|
AltAlleleStats aas = sampleStats.get(samp);
|
||||||
|
if (aas == null) {
|
||||||
|
aas = new AltAlleleStats();
|
||||||
|
sampleStats.put(samp, aas);
|
||||||
|
}
|
||||||
|
|
||||||
|
Genotype gt1 = vc1.getGenotype(samp);
|
||||||
|
Genotype gt2 = vc2.getGenotype(samp);
|
||||||
|
if (gt1 == null || gt2 == null) {
|
||||||
|
aas.oneSampleMissing++;
|
||||||
|
}
|
||||||
|
else if (gt1.isNoCall() || gt1.isFiltered() || gt2.isNoCall() || gt2.isFiltered()) {
|
||||||
|
aas.atLeastOneSampleNotCalledOrFiltered++;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
aas.numSuccessiveGenotypes++;
|
||||||
|
|
||||||
|
if (mergeDistanceInRange) {
|
||||||
|
aas.numSuccessiveGenotypesWithinDistance++;
|
||||||
|
|
||||||
|
if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) {
|
||||||
|
aas.segregationUnknown++;
|
||||||
|
logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(vc1) + ", " + VariantContextUtils.getLocation(vc2));
|
||||||
|
}
|
||||||
|
else if (gt1.isHomRef() || gt2.isHomRef()) {
|
||||||
|
aas.eitherNotVariant++;
|
||||||
|
}
|
||||||
|
else { // BOTH gt1 and gt2 have at least one variant allele (so either hets, or homozygous variant):
|
||||||
|
aas.bothInPairHaveVariant++;
|
||||||
|
|
||||||
|
List<Allele> site1Alleles = gt1.getAlleles();
|
||||||
|
List<Allele> site2Alleles = gt2.getAlleles();
|
||||||
|
|
||||||
|
Iterator<Allele> all2It = site2Alleles.iterator();
|
||||||
|
for (Allele all1 : site1Alleles) {
|
||||||
|
Allele all2 = all2It.next(); // this is OK, since alleleSegregationIsKnown(gt1, gt2)
|
||||||
|
|
||||||
|
if (all1.isReference()) {
|
||||||
|
if (all2.isReference())
|
||||||
|
aas.ref_ref_pair++;
|
||||||
|
else
|
||||||
|
aas.ref_alt_pair++;
|
||||||
|
}
|
||||||
|
else { // all1.isNonReference()
|
||||||
|
if (all2.isReference())
|
||||||
|
aas.alt_ref_pair++;
|
||||||
|
else
|
||||||
|
aas.alt_alt_pair++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check MNPs vs. CHets:
|
||||||
|
if (containsRefAllele(site1Alleles) && containsRefAllele(site2Alleles)) {
|
||||||
|
logger.debug("HET-HET for " + samp + " at " + VariantContextUtils.getLocation(vc1) + ", " + VariantContextUtils.getLocation(vc2));
|
||||||
|
if (logger.isDebugEnabled() && !(gt1.isHet() && gt2.isHet()))
|
||||||
|
throw new ReviewedStingException("Since !gt1.isHomRef() && !gt2.isHomRef(), yet both have ref alleles, they BOTH must be hets!");
|
||||||
|
|
||||||
|
// There's the potential to only have REF-ALT, ALT-REF (CHet), or possibly ALT-ALT together (MNP)
|
||||||
|
boolean hasMNP = false;
|
||||||
|
|
||||||
|
all2It = site2Alleles.iterator();
|
||||||
|
for (Allele all1 : site1Alleles) {
|
||||||
|
Allele all2 = all2It.next(); // this is OK, since alleleSegregationIsKnown(gt1, gt2)
|
||||||
|
|
||||||
|
if (all1.isNonReference() && all2.isNonReference()) {
|
||||||
|
hasMNP = true; // has at least one haplotype of ALT-ALT that segregates!
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasMNP)
|
||||||
|
aas.MNPsites++;
|
||||||
|
else
|
||||||
|
aas.CHetSites++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean containsRefAllele(List<Allele> siteAlleles) {
|
||||||
|
for (Allele all : siteAlleles) {
|
||||||
|
if (all.isReference())
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
sb.append("-------------------------------------------------------------------------\n");
|
||||||
|
sb.append("Per-sample alternate allele statistics [Merge distance <= " + distance + "]\n");
|
||||||
|
sb.append("-------------------------------------------------------------------------");
|
||||||
|
|
||||||
|
for (Map.Entry<String, AltAlleleStats> sampAltAllStatsEntry : sampleStats.entrySet()) {
|
||||||
|
String samp = sampAltAllStatsEntry.getKey();
|
||||||
|
AltAlleleStats stats = sampAltAllStatsEntry.getValue();
|
||||||
|
sb.append("\n* Sample:\t" + samp + "\n" + stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,175 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2010, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
|
||||||
|
|
||||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
|
||||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
|
||||||
import org.broad.tribble.vcf.VCFHeader;
|
|
||||||
import org.broad.tribble.vcf.VCFWriter;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
// Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts
|
|
||||||
public class MergePhasedSegregatingPolymorphismsToMNPvcfWriter implements VCFWriter {
|
|
||||||
private VCFWriter innerWriter;
|
|
||||||
|
|
||||||
private ReferenceSequenceFile referenceFileForMNPmerging;
|
|
||||||
private int maxGenomicDistanceForMNP;
|
|
||||||
|
|
||||||
private VCFRecord vcfrWaitingToMerge;
|
|
||||||
private List<VCFRecord> filteredVcfrList;
|
|
||||||
private int numRecordsWithinDistance;
|
|
||||||
private int numMergedRecords;
|
|
||||||
|
|
||||||
private Logger logger;
|
|
||||||
|
|
||||||
// Should we call innerWriter.close() in close()
|
|
||||||
private boolean takeOwnershipOfInner;
|
|
||||||
|
|
||||||
public MergePhasedSegregatingPolymorphismsToMNPvcfWriter(VCFWriter innerWriter, File referenceFile, int maxGenomicDistanceForMNP, Logger logger, boolean takeOwnershipOfInner) {
|
|
||||||
this.innerWriter = innerWriter;
|
|
||||||
this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile);
|
|
||||||
this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP;
|
|
||||||
this.vcfrWaitingToMerge = null;
|
|
||||||
this.filteredVcfrList = new LinkedList<VCFRecord>();
|
|
||||||
this.numRecordsWithinDistance = 0;
|
|
||||||
this.numMergedRecords = 0;
|
|
||||||
this.logger = logger;
|
|
||||||
this.takeOwnershipOfInner = takeOwnershipOfInner;
|
|
||||||
}
|
|
||||||
|
|
||||||
public MergePhasedSegregatingPolymorphismsToMNPvcfWriter(VCFWriter innerWriter, File referenceFile, int maxGenomicDistanceForMNP, Logger logger) {
|
|
||||||
this(innerWriter, referenceFile, maxGenomicDistanceForMNP, logger, false); // by default, don't own inner
|
|
||||||
}
|
|
||||||
|
|
||||||
public void writeHeader(VCFHeader header) {
|
|
||||||
innerWriter.writeHeader(header);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() {
|
|
||||||
stopWaitingToMerge();
|
|
||||||
|
|
||||||
if (takeOwnershipOfInner)
|
|
||||||
innerWriter.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void add(VariantContext vc, byte refBase) {
|
|
||||||
logger.debug("Next VC input = " + VariantContextUtils.getLocation(vc));
|
|
||||||
boolean curVcIsNotFiltered = vc.isNotFiltered();
|
|
||||||
|
|
||||||
if (vcfrWaitingToMerge == null) {
|
|
||||||
logger.debug("NOT Waiting to merge...");
|
|
||||||
|
|
||||||
if (!filteredVcfrList.isEmpty())
|
|
||||||
throw new ReviewedStingException("filteredVcfrList should be empty if not waiting to merge a vc!");
|
|
||||||
|
|
||||||
if (curVcIsNotFiltered) { // still need to wait before can release vc
|
|
||||||
logger.debug("Waiting for new variant " + VariantContextUtils.getLocation(vc));
|
|
||||||
vcfrWaitingToMerge = new VCFRecord(vc, refBase);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
logger.debug("DIRECTLY output " + VariantContextUtils.getLocation(vc));
|
|
||||||
innerWriter.add(vc, refBase);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else { // waiting to merge vcfrWaitingToMerge
|
|
||||||
logger.debug("Waiting to merge " + VariantContextUtils.getLocation(vcfrWaitingToMerge.vc));
|
|
||||||
|
|
||||||
if (!curVcIsNotFiltered) {
|
|
||||||
logger.debug("Caching unprocessed output " + VariantContextUtils.getLocation(vc));
|
|
||||||
filteredVcfrList.add(new VCFRecord(vc, refBase));
|
|
||||||
}
|
|
||||||
else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them:
|
|
||||||
boolean mergedRecords = false;
|
|
||||||
if (minDistance(vcfrWaitingToMerge.vc, vc) <= maxGenomicDistanceForMNP) {
|
|
||||||
numRecordsWithinDistance++;
|
|
||||||
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
|
|
||||||
if (mergedVc != null) {
|
|
||||||
mergedRecords = true;
|
|
||||||
vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase);
|
|
||||||
numMergedRecords++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!mergedRecords) {
|
|
||||||
stopWaitingToMerge();
|
|
||||||
vcfrWaitingToMerge = new VCFRecord(vc, refBase);
|
|
||||||
}
|
|
||||||
logger.debug("Merged? = " + mergedRecords);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void stopWaitingToMerge() {
|
|
||||||
if (vcfrWaitingToMerge == null) {
|
|
||||||
if (!filteredVcfrList.isEmpty())
|
|
||||||
throw new ReviewedStingException("filteredVcfrList should be empty if not waiting to merge a vc!");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
innerWriter.add(vcfrWaitingToMerge.vc, vcfrWaitingToMerge.refBase);
|
|
||||||
vcfrWaitingToMerge = null;
|
|
||||||
|
|
||||||
for (VCFRecord vcfr : filteredVcfrList)
|
|
||||||
innerWriter.add(vcfr.vc, vcfr.refBase);
|
|
||||||
filteredVcfrList.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getNumMergeableRecordsWithinDistance() {
|
|
||||||
return numRecordsWithinDistance;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getNumMergedRecords() {
|
|
||||||
return numMergedRecords;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static int minDistance(VariantContext vc1, VariantContext vc2) {
|
|
||||||
return VariantContextUtils.getLocation(vc1).minDistance(VariantContextUtils.getLocation(vc2));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets a string representation of this object.
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
return getClass().getName();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class VCFRecord {
|
|
||||||
public VariantContext vc;
|
|
||||||
public byte refBase;
|
|
||||||
|
|
||||||
public VCFRecord(VariantContext vc, byte refBase) {
|
|
||||||
this.vc = vc;
|
|
||||||
this.refBase = refBase;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -47,15 +47,18 @@ import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods;
|
||||||
@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class))
|
@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class))
|
||||||
@By(DataSource.REFERENCE_ORDERED_DATA)
|
@By(DataSource.REFERENCE_ORDERED_DATA)
|
||||||
|
|
||||||
public class MergeSegregatingPolymorphismsWalker extends RodWalker<Integer, Integer> {
|
public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
@Output(doc = "File to which variants should be written", required = true)
|
@Output(doc = "File to which variants should be written", required = true)
|
||||||
protected VCFWriter writer = null;
|
protected VCFWriter writer = null;
|
||||||
private MergePhasedSegregatingPolymorphismsToMNPvcfWriter vcMergerWriter = null;
|
private MergePhasedSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;
|
||||||
|
|
||||||
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
|
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
|
||||||
protected int maxGenomicDistanceForMNP = 1;
|
protected int maxGenomicDistanceForMNP = 1;
|
||||||
|
|
||||||
|
@Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
|
||||||
|
protected boolean disablePrintAlternateAlleleStatistics = false;
|
||||||
|
|
||||||
private LinkedList<String> rodNames = null;
|
private LinkedList<String> rodNames = null;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
@ -67,7 +70,7 @@ public class MergeSegregatingPolymorphismsWalker extends RodWalker<Integer, Inte
|
||||||
|
|
||||||
private void initializeVcfWriter() {
|
private void initializeVcfWriter() {
|
||||||
// false <-> don't take control of writer, since didn't create it:
|
// false <-> don't take control of writer, since didn't create it:
|
||||||
vcMergerWriter = new MergePhasedSegregatingPolymorphismsToMNPvcfWriter(writer, getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false);
|
vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false, !disablePrintAlternateAlleleStatistics);
|
||||||
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
|
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
|
||||||
|
|
||||||
// setup the header fields:
|
// setup the header fields:
|
||||||
|
|
@ -134,7 +137,10 @@ public class MergeSegregatingPolymorphismsWalker extends RodWalker<Integer, Inte
|
||||||
*/
|
*/
|
||||||
public void onTraversalDone(Integer result) {
|
public void onTraversalDone(Integer result) {
|
||||||
vcMergerWriter.close();
|
vcMergerWriter.close();
|
||||||
System.out.println("Number of potentially merged records (distance <= "+ maxGenomicDistanceForMNP + "): " + vcMergerWriter.getNumMergeableRecordsWithinDistance());
|
|
||||||
System.out.println("Number of records merged: " + vcMergerWriter.getNumMergedRecords());
|
System.out.println("Number of successive pairs of records (any distance): " + vcMergerWriter.getNumRecordsAttemptToMerge());
|
||||||
|
System.out.println("Number of potentially merged records (distance <= "+ maxGenomicDistanceForMNP + "): " + vcMergerWriter.getNumRecordsWithinDistance());
|
||||||
|
System.out.println("Number of records merged [all samples are mergeable, some sample has a MNP of ALT alleles]: " + vcMergerWriter.getNumMergedRecords());
|
||||||
|
System.out.println(vcMergerWriter.getAltAlleleStats());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -129,8 +129,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
// Wrapper VCFWriters will take ownership of inner writers iff: inner writer != origWriter [which wasn't created here]
|
// Wrapper VCFWriters will take ownership of inner writers iff: inner writer != origWriter [which wasn't created here]
|
||||||
VCFWriter origWriter = writer;
|
VCFWriter origWriter = writer;
|
||||||
|
|
||||||
if (enableMergePhasedSegregatingPolymorphismsToMNP)
|
if (enableMergePhasedSegregatingPolymorphismsToMNP) // false <-> don't track the statistics of alternate alleles being merged:
|
||||||
writer = new MergePhasedSegregatingPolymorphismsToMNPvcfWriter(writer, getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, writer != origWriter);
|
writer = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, writer != origWriter, false);
|
||||||
|
|
||||||
/* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow
|
/* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow
|
||||||
Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow
|
Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow
|
||||||
|
|
@ -289,7 +289,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
|
|
||||||
logger.debug("sample = " + samp);
|
logger.debug("sample = " + samp);
|
||||||
if (isUnfilteredCalledDiploidGenotype(gt)) {
|
if (isUnfilteredCalledDiploidGenotype(gt)) {
|
||||||
if (gt.isHom()) {
|
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
|
||||||
// true <-> can trivially phase a hom site relative to ANY previous site:
|
// true <-> can trivially phase a hom site relative to ANY previous site:
|
||||||
Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getNegLog10PError(), gt.getFilters(), gt.getAttributes(), true);
|
Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getNegLog10PError(), gt.getFilters(), gt.getAttributes(), true);
|
||||||
uvc.setGenotype(samp, phasedGt);
|
uvc.setGenotype(samp, phasedGt);
|
||||||
|
|
|
||||||
|
|
@ -5,10 +5,10 @@ import org.junit.Test;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
public class MergeSegregatingPolymorphismsIntegrationTest extends WalkerTest {
|
public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
public static String baseTestString(String reference, String VCF, int maxDistMNP) {
|
public static String baseTestString(String reference, String VCF, int maxDistMNP) {
|
||||||
return "-T MergeSegregatingPolymorphisms" +
|
return "-T MergeSegregatingAlternateAlleles" +
|
||||||
" -R " + reference +
|
" -R " + reference +
|
||||||
" -B:variant,VCF " + validationDataLocation + VCF +
|
" -B:variant,VCF " + validationDataLocation + VCF +
|
||||||
" --maxGenomicDistanceForMNP " + maxDistMNP +
|
" --maxGenomicDistanceForMNP " + maxDistMNP +
|
||||||
|
|
@ -23,7 +23,7 @@ public class MergeSegregatingPolymorphismsIntegrationTest extends WalkerTest {
|
||||||
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1)
|
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1)
|
||||||
+ " -L chr20:556259-756570",
|
+ " -L chr20:556259-756570",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("19d0b2361367024bb9a83b9c15ef2453"));
|
Arrays.asList("e6a14fc97dbd0aaa8e6a4d9a7f1616a6"));
|
||||||
executeTest("Merge MNP het sites within genomic distance of 1 [TEST ONE]", spec);
|
executeTest("Merge MNP het sites within genomic distance of 1 [TEST ONE]", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -33,7 +33,7 @@ public class MergeSegregatingPolymorphismsIntegrationTest extends WalkerTest {
|
||||||
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10)
|
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10)
|
||||||
+ " -L chr20:556259-756570",
|
+ " -L chr20:556259-756570",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("f25a6403579dab1395773b3ba365c327"));
|
Arrays.asList("cc2b45c85a51b4998e30758c48f61940"));
|
||||||
executeTest("Merge MNP het sites within genomic distance of 10 [TEST TWO]", spec);
|
executeTest("Merge MNP het sites within genomic distance of 10 [TEST TWO]", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -43,7 +43,7 @@ public class MergeSegregatingPolymorphismsIntegrationTest extends WalkerTest {
|
||||||
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100)
|
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100)
|
||||||
+ " -L chr20:556259-756570",
|
+ " -L chr20:556259-756570",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("a064955ffeea7fc4e09512f3e9cdbb9e"));
|
Arrays.asList("47300cc7a5a7d84b3c279f04c4567739"));
|
||||||
executeTest("Merge MNP het sites within genomic distance of 100 [TEST THREE]", spec);
|
executeTest("Merge MNP het sites within genomic distance of 100 [TEST THREE]", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
Loading…
Reference in New Issue