diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index d32c840db..c0d818d8b 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -761,45 +761,22 @@ public class VariantContextUtils { if (!mergeIntoMNPvalidationCheck(vc1, vc2)) return null; - Map allele1ToAllele2 = calcMergeAllele1WithAllele2Map(vc1, vc2); - if (allele1ToAllele2 == null) + // Check that it's logically possible to merge the VCs, and that there's a point in doing so (e.g., annotations could be changed): + if (!allSamplesAreMergeable(vc1, vc2) || !someSampleHasDoubleNonReferenceAllele(vc1, vc2)) return null; - return reallyMergeIntoMNP(vc1, vc2, referenceFile, allele1ToAllele2); + return reallyMergeIntoMNP(vc1, vc2, referenceFile); } - private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, Map allele1ToAllele2) { + private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { int startInter = vc1.getEnd() + 1; int endInter = vc2.getStart() - 1; byte[] intermediateBases = null; - int intermediateLength = 0; if (startInter <= endInter) { intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); StringUtil.toUpperCase(intermediateBases); - intermediateLength = intermediateBases.length; } - - Map allele1ToMergedAllele = new HashMap(); - for (Map.Entry all1all2Entry : allele1ToAllele2.entrySet()) { - Allele all1 = all1all2Entry.getKey(); - Allele all2 = all1all2Entry.getValue(); - - byte[] bases1 = all1.getBases(); - byte[] bases2 = all2.getBases(); - - byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length]; - System.arraycopy(bases1, 0, mergedBases, 0, bases1.length); - if (intermediateBases != null) - System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); - System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); - - Allele mergedAllele = Allele.create(mergedBases, all1.equals(vc1.getReference())); - allele1ToMergedAllele.put(all1, mergedAllele); - } - - Set allMergedAlleles = new HashSet(); - Allele mergedRefAllele = allele1ToMergedAllele.get(vc1.getReference()); - allMergedAlleles.add(mergedRefAllele); // ensure that the reference allele is added + MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added Map mergedGenotypes = new HashMap(); for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { @@ -808,10 +785,15 @@ public class VariantContextUtils { Genotype gt2 = vc2.getGenotype(sample); List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + List mergedAllelesForSample = new LinkedList(); + + Iterator all2It = site2Alleles.iterator(); for (Allele all1 : site1Alleles) { - Allele mergedAllele = allele1ToMergedAllele.get(all1); - allMergedAlleles.add(mergedAllele); + Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() + + Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2); mergedAllelesForSample.add(mergedAllele); } @@ -823,7 +805,7 @@ public class VariantContextUtils { if (PQ != null) mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, PQ); - Genotype mergedGt = new Genotype(sample, mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, gt1.genotypesArePhased()); + Genotype mergedGt = new Genotype(sample, mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, calcPhaseForMergedGenotypes(gt1, gt2)); mergedGenotypes.put(sample, mergedGt); } @@ -834,7 +816,7 @@ public class VariantContextUtils { if (mergedAttribs == null) return null; - VariantContext mergedVc = new VariantContext(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), allMergedAlleles, mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs); + VariantContext mergedVc = new VariantContext(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles(), mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs); /* Calculate VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY from scratch [though technically they should already be consistent with each of vc1 and vc2's respective alleles]: @@ -846,6 +828,71 @@ public class VariantContextUtils { return mergedVc; } + private static class AlleleOneAndTwo { + private Allele all1; + private Allele all2; + + public AlleleOneAndTwo(Allele all1, Allele all2) { + this.all1 = all1; + this.all2 = all2; + } + + public int hashCode() { + return all1.hashCode() + all2.hashCode(); + } + + public boolean equals(Object other) { + if (!(other instanceof AlleleOneAndTwo)) + return false; + + AlleleOneAndTwo otherAot = (AlleleOneAndTwo)other; + return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2)); + } + } + + private static class MergedAllelesData { + private Map mergedAlleles; + private byte[] intermediateBases; + private int intermediateLength; + + public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { + this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo + this.intermediateBases = intermediateBases; + this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0; + + this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); + } + + public Allele ensureMergedAllele(Allele all1, Allele all2) { + return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor + } + + private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { + AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); + Allele mergedAllele = mergedAlleles.get(all12); + + if (mergedAllele == null) { + byte[] bases1 = all1.getBases(); + byte[] bases2 = all2.getBases(); + + byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length]; + System.arraycopy(bases1, 0, mergedBases, 0, bases1.length); + if (intermediateBases != null) + System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); + System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); + + mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime); + mergedAlleles.put(all12, mergedAllele); + } + + return mergedAllele; + } + + public Set getAllMergedAlleles() { + return new HashSet(mergedAlleles.values()); + } + } + private static String mergeVariantContextNames(String name1, String name2) { return name1 + "_" + name2; } @@ -935,49 +982,76 @@ public class VariantContextUtils { return true; } - private static Map calcMergeAllele1WithAllele2Map(VariantContext vc1, VariantContext vc2) { - // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): - Map allele1ToAllele2 = new HashMap(); - Map allele2ToAllele1 = new HashMap(); - - // Note the segregation of the alleles for the reference genome: - allele1ToAllele2.put(vc1.getReference(), vc2.getReference()); - allele2ToAllele1.put(vc2.getReference(), vc1.getReference()); - - // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples). - // [Also, check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1.] + // Assumes that vc1 and vc2 were already checked to have the same sample names: + private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { + // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { String sample = gt1Entry.getKey(); Genotype gt1 = gt1Entry.getValue(); Genotype gt2 = vc2.getGenotype(sample); - if (gt1.getPloidy() != gt2.getPloidy()) - return null; + if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom + return false; + } - if (!gt2.genotypesArePhased() && !gt2.isHom()) // since can merge if phased or even if an unphased hom - return null; + return true; + } + + public static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { + if (gt1.getPloidy() != gt2.getPloidy()) + return false; + + /* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard]. + + HOWEVER, EVEN if this is not the case, but gt1.isHom(), + it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1. + */ + return (gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()); + } + + // Assumes that alleleSegregationIsKnown(gt1, gt2): + private static boolean calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) { + if (gt2.genotypesArePhased() || gt2.isHom()) + return gt1.genotypesArePhased(); // maintain the phase of gt1 + + if (!gt1.isHom()) + throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()"); + + /* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype + + For example, if we're merging the third Genotype with the second one: + 0/1 + 1|1 + 0/1 + + Then, we want to output: + 0/1 + 1/2 + */ + return false; // maintain the phase of gt2 [since !gt2.genotypesArePhased()] + } + + /* Checks if any sample has a MNP of ALT alleles (segregating together): + [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] + */ + private static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { + for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { + String sample = gt1Entry.getKey(); + Genotype gt1 = gt1Entry.getValue(); + Genotype gt2 = vc2.getGenotype(sample); List site1Alleles = gt1.getAlleles(); List site2Alleles = gt2.getAlleles(); Iterator all2It = site2Alleles.iterator(); for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); + Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() - Allele all1To2 = allele1ToAllele2.get(all1); - if (all1To2 == null) - allele1ToAllele2.put(all1, all2); - else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2 - return null; - - Allele all2To1 = allele2ToAllele1.get(all2); - if (all2To1 == null) - allele2ToAllele1.put(all2, all1); - else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1 - return null; + if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate + return true; } } - return allele1ToAllele2; + return false; } -} +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java new file mode 100644 index 000000000..d03a5d21f --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFile; +import org.apache.log4j.Logger; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFWriter; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.util.*; + +// Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts + +public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWriter { + private VCFWriter innerWriter; + + private ReferenceSequenceFile referenceFileForMNPmerging; + private int maxGenomicDistanceForMNP; + + private VCFRecord vcfrWaitingToMerge; + private List filteredVcfrList; + + private int numRecordsAttemptToMerge; + private int numRecordsWithinDistance; + private int numMergedRecords; + private AltAlleleStatsForSamples altAlleleStats = null; + + private Logger logger; + + // Should we call innerWriter.close() in close() + private boolean takeOwnershipOfInner; + + public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, File referenceFile, int maxGenomicDistanceForMNP, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) { + this.innerWriter = innerWriter; + this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile); + this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP; + this.vcfrWaitingToMerge = null; + this.filteredVcfrList = new LinkedList(); + this.numRecordsWithinDistance = 0; + this.numMergedRecords = 0; + + if (trackAltAlleleStats) + this.altAlleleStats = new AltAlleleStatsForSamples(maxGenomicDistanceForMNP); + + this.logger = logger; + this.takeOwnershipOfInner = takeOwnershipOfInner; + } + + public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, File referenceFile, int maxGenomicDistanceForMNP, Logger logger) { + this(innerWriter, referenceFile, maxGenomicDistanceForMNP, logger, false, false); // by default, don't own inner and don't keep track of alt allele statistics + } + + public void writeHeader(VCFHeader header) { + innerWriter.writeHeader(header); + } + + public void close() { + stopWaitingToMerge(); + + if (takeOwnershipOfInner) + innerWriter.close(); + } + + public void add(VariantContext vc, byte refBase) { + logger.debug("Next VC input = " + VariantContextUtils.getLocation(vc)); + boolean curVcIsNotFiltered = vc.isNotFiltered(); + + if (vcfrWaitingToMerge == null) { + logger.debug("NOT Waiting to merge..."); + + if (!filteredVcfrList.isEmpty()) + throw new ReviewedStingException("filteredVcfrList should be empty if not waiting to merge a vc!"); + + if (curVcIsNotFiltered) { // still need to wait before can release vc + logger.debug("Waiting for new variant " + VariantContextUtils.getLocation(vc)); + vcfrWaitingToMerge = new VCFRecord(vc, refBase); + } + else { + logger.debug("DIRECTLY output " + VariantContextUtils.getLocation(vc)); + innerWriter.add(vc, refBase); + } + } + else { // waiting to merge vcfrWaitingToMerge + logger.debug("Waiting to merge " + VariantContextUtils.getLocation(vcfrWaitingToMerge.vc)); + + if (!curVcIsNotFiltered) { + logger.debug("Caching unprocessed output " + VariantContextUtils.getLocation(vc)); + filteredVcfrList.add(new VCFRecord(vc, refBase)); + } + else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them: + numRecordsAttemptToMerge++; + boolean mergeDistanceInRange = (minDistance(vcfrWaitingToMerge.vc, vc) <= maxGenomicDistanceForMNP); + + /* + TODO: -- CONSIDER THE FOLLOWING EXAMPLE: WHAT DO WE WANT HERE??? -- + If the following 3 genotypes originally exist for a sample [at sites 1, 2, and 3]: + 1/1 + 0|1 + 0|1 + + Then, after merging the first two, we have [at sites 1 and 3]: + 1/2 + 0|1 + + Then, not having merged would consider sites 2 and 3 as a MNP (since it's a diploid het site with haplotypes: REF-REF and ALT-ALT) + But, since we merged sites 1 and 2, we get that sites 1-2 and 3 are counted as two haplotypes of: ALT-REF and ALT-ALT + */ + if (altAlleleStats != null) + altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, mergeDistanceInRange); + + boolean mergedRecords = false; + if (mergeDistanceInRange) { + numRecordsWithinDistance++; + VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging); + if (mergedVc != null) { + mergedRecords = true; + vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase); + numMergedRecords++; + } + } + + if (!mergedRecords) { + stopWaitingToMerge(); + vcfrWaitingToMerge = new VCFRecord(vc, refBase); + } + logger.debug("Merged? = " + mergedRecords); + } + } + } + + private void stopWaitingToMerge() { + if (vcfrWaitingToMerge == null) { + if (!filteredVcfrList.isEmpty()) + throw new ReviewedStingException("filteredVcfrList should be empty if not waiting to merge a vc!"); + return; + } + + innerWriter.add(vcfrWaitingToMerge.vc, vcfrWaitingToMerge.refBase); + vcfrWaitingToMerge = null; + + for (VCFRecord vcfr : filteredVcfrList) + innerWriter.add(vcfr.vc, vcfr.refBase); + filteredVcfrList.clear(); + } + + public int getNumRecordsAttemptToMerge() { + return numRecordsAttemptToMerge; + } + + public int getNumRecordsWithinDistance() { + return numRecordsWithinDistance; + } + + public int getNumMergedRecords() { + return numMergedRecords; + } + + public static int minDistance(VariantContext vc1, VariantContext vc2) { + return VariantContextUtils.getLocation(vc1).minDistance(VariantContextUtils.getLocation(vc2)); + } + + /** + * Gets a string representation of this object. + * + * @return + */ + @Override + public String toString() { + return getClass().getName(); + } + + public String getAltAlleleStats() { + if (altAlleleStats == null) + return ""; + + return "\n" + altAlleleStats.toString(); + } + + private static class VCFRecord { + public VariantContext vc; + public byte refBase; + + public VCFRecord(VariantContext vc, byte refBase) { + this.vc = vc; + this.refBase = refBase; + } + } + + private class AltAlleleStats { + public int numSuccessiveGenotypes; + public int numSuccessiveGenotypesWithinDistance; + + public int oneSampleMissing; + public int atLeastOneSampleNotCalledOrFiltered; + public int segregationUnknown; + public int eitherNotVariant; + + public int bothInPairHaveVariant; + + public int ref_ref_pair; + public int ref_alt_pair; + public int alt_ref_pair; + public int alt_alt_pair; + + public int MNPsites; + public int CHetSites; + + public AltAlleleStats() { + this.numSuccessiveGenotypes = 0; + this.numSuccessiveGenotypesWithinDistance = 0; + + this.oneSampleMissing = 0; + this.atLeastOneSampleNotCalledOrFiltered = 0; + this.segregationUnknown = 0; + this.eitherNotVariant = 0; + + this.bothInPairHaveVariant = 0; + + this.ref_ref_pair = 0; + this.ref_alt_pair = 0; + this.alt_ref_pair = 0; + this.alt_alt_pair = 0; + + this.MNPsites = 0; + this.CHetSites = 0; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + + sb.append("Sample missing:\t" + oneSampleMissing + "\n"); + sb.append("Not called or filtered:\t" + atLeastOneSampleNotCalledOrFiltered + "\n"); + + sb.append("* Number of successive pairs of genotypes:\t" + numSuccessiveGenotypes + "\n"); + sb.append("Number of successive pairs of genotypes within distance:\t" + numSuccessiveGenotypesWithinDistance + "\n"); + + sb.append("Unknown segregation, within distance:\t" + segregationUnknown + "\n"); + sb.append("Not variant at least one of pair, segregation known, within distance:\t" + eitherNotVariant + "\n"); + sb.append("* Variant at both, segregation known, within distance:\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n"); + + sb.append("[Total haplotypes at pairs:\t" + (ref_ref_pair + ref_alt_pair + alt_ref_pair + alt_alt_pair) + "\n"); + sb.append("REF-REF:\t" + ref_ref_pair + "\n"); + sb.append("REF-ALT:\t" + ref_alt_pair + "\n"); + sb.append("ALT-REF:\t" + alt_ref_pair + "\n"); + sb.append("ALT-ALT:\t" + alt_alt_pair + "]\n"); + + int hetAfterHetSites = MNPsites + CHetSites; + sb.append("* Het-Het sites (with REF allele present at each):\t" + percentageString(hetAfterHetSites, numSuccessiveGenotypesWithinDistance) + "\n"); + sb.append("* MNPs:\t" + percentageString(MNPsites, hetAfterHetSites) + "\n"); + sb.append("Compound Hets:\t" + CHetSites + "\n"); + + return sb.toString(); + } + + private String percentageString(int count, int baseCount) { + int NUM_DECIMAL_PLACES = 1; + String percent = new Formatter().format("%."+ NUM_DECIMAL_PLACES + "f", MathUtils.percentage(count, baseCount)).toString(); + return count + " (" + percent + "%)"; + } + } + + private class AltAlleleStatsForSamples { + private Map sampleStats; + private int distance; + + public AltAlleleStatsForSamples(int distance) { + this.sampleStats = new HashMap(); + this.distance = distance; + } + + public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean mergeDistanceInRange) { + if (vc1.isFiltered() || vc2.isFiltered()) + return; + + Set allSamples = new TreeSet(vc1.getSampleNames()); + allSamples.addAll(vc2.getSampleNames()); + + for (String samp : allSamples) { + AltAlleleStats aas = sampleStats.get(samp); + if (aas == null) { + aas = new AltAlleleStats(); + sampleStats.put(samp, aas); + } + + Genotype gt1 = vc1.getGenotype(samp); + Genotype gt2 = vc2.getGenotype(samp); + if (gt1 == null || gt2 == null) { + aas.oneSampleMissing++; + } + else if (gt1.isNoCall() || gt1.isFiltered() || gt2.isNoCall() || gt2.isFiltered()) { + aas.atLeastOneSampleNotCalledOrFiltered++; + } + else { + aas.numSuccessiveGenotypes++; + + if (mergeDistanceInRange) { + aas.numSuccessiveGenotypesWithinDistance++; + + if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) { + aas.segregationUnknown++; + logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(vc1) + ", " + VariantContextUtils.getLocation(vc2)); + } + else if (gt1.isHomRef() || gt2.isHomRef()) { + aas.eitherNotVariant++; + } + else { // BOTH gt1 and gt2 have at least one variant allele (so either hets, or homozygous variant): + aas.bothInPairHaveVariant++; + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); // this is OK, since alleleSegregationIsKnown(gt1, gt2) + + if (all1.isReference()) { + if (all2.isReference()) + aas.ref_ref_pair++; + else + aas.ref_alt_pair++; + } + else { // all1.isNonReference() + if (all2.isReference()) + aas.alt_ref_pair++; + else + aas.alt_alt_pair++; + } + } + + // Check MNPs vs. CHets: + if (containsRefAllele(site1Alleles) && containsRefAllele(site2Alleles)) { + logger.debug("HET-HET for " + samp + " at " + VariantContextUtils.getLocation(vc1) + ", " + VariantContextUtils.getLocation(vc2)); + if (logger.isDebugEnabled() && !(gt1.isHet() && gt2.isHet())) + throw new ReviewedStingException("Since !gt1.isHomRef() && !gt2.isHomRef(), yet both have ref alleles, they BOTH must be hets!"); + + // There's the potential to only have REF-ALT, ALT-REF (CHet), or possibly ALT-ALT together (MNP) + boolean hasMNP = false; + + all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); // this is OK, since alleleSegregationIsKnown(gt1, gt2) + + if (all1.isNonReference() && all2.isNonReference()) { + hasMNP = true; // has at least one haplotype of ALT-ALT that segregates! + break; + } + } + + if (hasMNP) + aas.MNPsites++; + else + aas.CHetSites++; + } + } + } + } + } + } + + private boolean containsRefAllele(List siteAlleles) { + for (Allele all : siteAlleles) { + if (all.isReference()) + return true; + } + + return false; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("-------------------------------------------------------------------------\n"); + sb.append("Per-sample alternate allele statistics [Merge distance <= " + distance + "]\n"); + sb.append("-------------------------------------------------------------------------"); + + for (Map.Entry sampAltAllStatsEntry : sampleStats.entrySet()) { + String samp = sampAltAllStatsEntry.getKey(); + AltAlleleStats stats = sampAltAllStatsEntry.getValue(); + sb.append("\n* Sample:\t" + samp + "\n" + stats); + } + + return sb.toString(); + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingPolymorphismsToMNPvcfWriter.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingPolymorphismsToMNPvcfWriter.java deleted file mode 100644 index f17d41408..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingPolymorphismsToMNPvcfWriter.java +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFile; -import org.apache.log4j.Logger; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFWriter; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.File; -import java.util.LinkedList; -import java.util.List; - -// Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts -public class MergePhasedSegregatingPolymorphismsToMNPvcfWriter implements VCFWriter { - private VCFWriter innerWriter; - - private ReferenceSequenceFile referenceFileForMNPmerging; - private int maxGenomicDistanceForMNP; - - private VCFRecord vcfrWaitingToMerge; - private List filteredVcfrList; - private int numRecordsWithinDistance; - private int numMergedRecords; - - private Logger logger; - - // Should we call innerWriter.close() in close() - private boolean takeOwnershipOfInner; - - public MergePhasedSegregatingPolymorphismsToMNPvcfWriter(VCFWriter innerWriter, File referenceFile, int maxGenomicDistanceForMNP, Logger logger, boolean takeOwnershipOfInner) { - this.innerWriter = innerWriter; - this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile); - this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP; - this.vcfrWaitingToMerge = null; - this.filteredVcfrList = new LinkedList(); - this.numRecordsWithinDistance = 0; - this.numMergedRecords = 0; - this.logger = logger; - this.takeOwnershipOfInner = takeOwnershipOfInner; - } - - public MergePhasedSegregatingPolymorphismsToMNPvcfWriter(VCFWriter innerWriter, File referenceFile, int maxGenomicDistanceForMNP, Logger logger) { - this(innerWriter, referenceFile, maxGenomicDistanceForMNP, logger, false); // by default, don't own inner - } - - public void writeHeader(VCFHeader header) { - innerWriter.writeHeader(header); - } - - public void close() { - stopWaitingToMerge(); - - if (takeOwnershipOfInner) - innerWriter.close(); - } - - public void add(VariantContext vc, byte refBase) { - logger.debug("Next VC input = " + VariantContextUtils.getLocation(vc)); - boolean curVcIsNotFiltered = vc.isNotFiltered(); - - if (vcfrWaitingToMerge == null) { - logger.debug("NOT Waiting to merge..."); - - if (!filteredVcfrList.isEmpty()) - throw new ReviewedStingException("filteredVcfrList should be empty if not waiting to merge a vc!"); - - if (curVcIsNotFiltered) { // still need to wait before can release vc - logger.debug("Waiting for new variant " + VariantContextUtils.getLocation(vc)); - vcfrWaitingToMerge = new VCFRecord(vc, refBase); - } - else { - logger.debug("DIRECTLY output " + VariantContextUtils.getLocation(vc)); - innerWriter.add(vc, refBase); - } - } - else { // waiting to merge vcfrWaitingToMerge - logger.debug("Waiting to merge " + VariantContextUtils.getLocation(vcfrWaitingToMerge.vc)); - - if (!curVcIsNotFiltered) { - logger.debug("Caching unprocessed output " + VariantContextUtils.getLocation(vc)); - filteredVcfrList.add(new VCFRecord(vc, refBase)); - } - else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them: - boolean mergedRecords = false; - if (minDistance(vcfrWaitingToMerge.vc, vc) <= maxGenomicDistanceForMNP) { - numRecordsWithinDistance++; - VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging); - if (mergedVc != null) { - mergedRecords = true; - vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase); - numMergedRecords++; - } - } - if (!mergedRecords) { - stopWaitingToMerge(); - vcfrWaitingToMerge = new VCFRecord(vc, refBase); - } - logger.debug("Merged? = " + mergedRecords); - } - } - } - - private void stopWaitingToMerge() { - if (vcfrWaitingToMerge == null) { - if (!filteredVcfrList.isEmpty()) - throw new ReviewedStingException("filteredVcfrList should be empty if not waiting to merge a vc!"); - return; - } - - innerWriter.add(vcfrWaitingToMerge.vc, vcfrWaitingToMerge.refBase); - vcfrWaitingToMerge = null; - - for (VCFRecord vcfr : filteredVcfrList) - innerWriter.add(vcfr.vc, vcfr.refBase); - filteredVcfrList.clear(); - } - - public int getNumMergeableRecordsWithinDistance() { - return numRecordsWithinDistance; - } - - public int getNumMergedRecords() { - return numMergedRecords; - } - - public static int minDistance(VariantContext vc1, VariantContext vc2) { - return VariantContextUtils.getLocation(vc1).minDistance(VariantContextUtils.getLocation(vc2)); - } - - /** - * Gets a string representation of this object. - * @return - */ - @Override - public String toString() { - return getClass().getName(); - } - - private static class VCFRecord { - public VariantContext vc; - public byte refBase; - - public VCFRecord(VariantContext vc, byte refBase) { - this.vc = vc; - this.refBase = refBase; - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingPolymorphismsWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java similarity index 83% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingPolymorphismsWalker.java rename to java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java index f922f89c7..f0cb8cea6 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingPolymorphismsWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java @@ -47,15 +47,18 @@ import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods; @Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) @By(DataSource.REFERENCE_ORDERED_DATA) -public class MergeSegregatingPolymorphismsWalker extends RodWalker { +public class MergeSegregatingAlternateAllelesWalker extends RodWalker { @Output(doc = "File to which variants should be written", required = true) protected VCFWriter writer = null; - private MergePhasedSegregatingPolymorphismsToMNPvcfWriter vcMergerWriter = null; + private MergePhasedSegregatingAlternateAllelesVCFWriter vcMergerWriter = null; @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false) protected int maxGenomicDistanceForMNP = 1; + @Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false) + protected boolean disablePrintAlternateAlleleStatistics = false; + private LinkedList rodNames = null; public void initialize() { @@ -67,7 +70,7 @@ public class MergeSegregatingPolymorphismsWalker extends RodWalker don't take control of writer, since didn't create it: - vcMergerWriter = new MergePhasedSegregatingPolymorphismsToMNPvcfWriter(writer, getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false); + vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false, !disablePrintAlternateAlleleStatistics); writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter] // setup the header fields: @@ -134,7 +137,10 @@ public class MergeSegregatingPolymorphismsWalker extends RodWalker don't track the statistics of alternate alleles being merged: + writer = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, writer != origWriter, false); /* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow @@ -289,7 +289,7 @@ public class ReadBackedPhasingWalker extends RodWalker