diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java new file mode 100644 index 000000000..5a32479ab --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.utils.BaseUtils; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +public abstract class BaseArray implements Comparable { + protected Byte[] bases; + + public BaseArray(byte[] bases) { + this.bases = new Byte[bases.length]; + for (int i = 0; i < bases.length; i++) + this.bases[i] = bases[i]; + } + + public BaseArray(Byte[] bases) { + this.bases = Arrays.copyOf(bases, bases.length); + } + + public BaseArray(int length) { + this.bases = new Byte[length]; + Arrays.fill(this.bases, null); + } + + public BaseArray(BaseArray other) { + this(other.bases); + } + + public void updateBase(int index, Byte base) { + bases[index] = base; + } + + public Byte getBase(int index) { + return bases[index]; + } + + public int size() { + return bases.length; + } + + public String toString() { + StringBuilder sb = new StringBuilder(bases.length); + for (Byte b : bases) + sb.append(b != null ? (char) b.byteValue() : "_"); + + return sb.toString(); + } + + public int compareTo(BaseArray that) { + int sz = this.bases.length; + if (sz != that.bases.length) + return (sz - that.bases.length); + + for (int i = 0; i < sz; i++) { + Byte thisBase = this.getBase(i); + Byte thatBase = that.getBase(i); + if (thisBase == null || thatBase == null) { + if (thisBase == null && thatBase != null) { + return -1; + } + else if (thisBase != null && thatBase == null) { + return 1; + } + } + else if (!BaseUtils.extendedBasesAreEqual(thisBase, thatBase)) { + return thisBase - thatBase; + } + } + return 0; + } + + public int[] getNonNullIndices() { + List nonNull = new LinkedList(); + for (int i = 0; i < bases.length; i++) { + if (getBase(i) != null) + nonNull.add(i); + } + + int[] nonNullArray = new int[nonNull.size()]; + int index = 0; + for (int i : nonNull) + nonNullArray[index++] = i; + return nonNullArray; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java new file mode 100644 index 000000000..3c20a311e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Arrays; + +public class Haplotype extends BaseArray implements Cloneable { + public Haplotype(byte[] bases) { + super(bases); + } + + private Haplotype(Byte[] bases) { + super(bases); + } + + public Haplotype(Haplotype other) { + super(other); + } + + public Haplotype(BaseArray baseArr) { + super(baseArr.bases); + + if (baseArr.getNonNullIndices().length != baseArr.bases.length) + throw new ReviewedStingException("Should NEVER call Haplotype ctor with null bases!"); + } + + public void updateBase(int index, Byte base) { + if (base == null) { + throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!"); + } + super.updateBase(index, base); + } + + public Haplotype clone() { + try { + super.clone(); + } catch (CloneNotSupportedException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + return new Haplotype(this); + } + + // Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive. + + public Haplotype subHaplotype(int fromIndex, int toIndex) { + return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size()))); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java new file mode 100644 index 000000000..343196bb2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.PreciseNonNegativeDouble; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Arrays; + +public class PhasingRead extends BaseArray { + private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly + private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT + private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT + + public PhasingRead(int length, int mappingQual) { + super(length); + + this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual)); + + this.baseProbs = new PreciseNonNegativeDouble[length]; + Arrays.fill(this.baseProbs, null); + + this.baseErrorProbs = new PreciseNonNegativeDouble[length]; + Arrays.fill(this.baseErrorProbs, null); + } + + public void updateBaseAndQuality(int index, Byte base, byte baseQual) { + updateBase(index, base); + + double errProb = QualityUtils.qualToErrorProb(baseQual); + + // The base error should be AT LEAST AS HIGH as the mapping error [equivalent to capping the base quality (BQ) by the mapping quality (MQ)]: + errProb = Math.max(errProb, 1.0 - mappingProb.getValue()); + + baseProbs[index] = new PreciseNonNegativeDouble(1.0 - errProb); // The probability that the true base is the base called in the read + baseErrorProbs[index] = new PreciseNonNegativeDouble(errProb / 3.0); // DIVIDE up the error probability EQUALLY over the 3 non-called bases + } + + public PhasingScore matchHaplotypeClassScore(HaplotypeClass hapClass) { + PreciseNonNegativeDouble value = new PreciseNonNegativeDouble(0.0); + for (Haplotype h : hapClass) + value.plusEqual(matchHaplotypeScore(h)); + + return new PhasingScore(value); + } + + private PreciseNonNegativeDouble matchHaplotypeScore(Haplotype hap) { + PreciseNonNegativeDouble score = new PreciseNonNegativeDouble(1.0); + + int sz = this.bases.length; + if (sz != hap.bases.length) + throw new ReviewedStingException("Read and Haplotype should have same length to be compared!"); + + // Technically, this HAS NO EFFECT since it is multiplied in for ALL haplotype pairs, but do so for completeness: + score.timesEqual(mappingProb); + + for (int i = 0; i < sz; i++) { + Byte thisBase = this.getBase(i); + Byte hapBase = hap.getBase(i); + if (thisBase != null && hapBase != null) { + if (BaseUtils.basesAreEqual(thisBase, hapBase)) + score.timesEqual(baseProbs[i]); + else + score.timesEqual(baseErrorProbs[i]); + } + } + + return score; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index cdf3ed33f..050d99a8c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -21,7 +21,6 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ - package org.broadinstitute.sting.gatk.walkers.phasing; import org.broad.tribble.util.variantcontext.Allele; @@ -202,7 +201,7 @@ public class ReadBackedPhasingWalker extends RodWalker unprocessedList = new LinkedList(); @@ -210,13 +209,14 @@ public class ReadBackedPhasingWalker extends RodWalker samplesToPhase) { // for ( String sample : samplesToPhase ) // logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); VariantContext subvc = vc.subContextFromGenotypes(vc.getGenotypes(samplesToPhase).values()); // logger.debug("original VC = " + vc); // logger.debug("sub VC = " + subvc); - return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF ); + return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); } private List processQueue(PhasingStats phaseStats, boolean processAll) { @@ -255,7 +256,7 @@ public class ReadBackedPhasingWalker extends RodWalker). Note that this will always leave at least one entry (the last one), since mostDownstreamLocusReached is in range of itself. @@ -266,16 +267,17 @@ public class ReadBackedPhasingWalker extends RodWalker prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt; /* Notes: @@ -350,15 +352,17 @@ public class ReadBackedPhasingWalker extends RodWalker gtAttribs = new HashMap(gt.getAttributes()); @@ -389,7 +393,7 @@ public class ReadBackedPhasingWalker extends RodWalker prevHetAndInteriorIt = null; private int phasingSiteIndex = -1; - private Map readsAtHetSites = null; + private Map readsAtHetSites = null; public boolean hasPreviousHets() { return phasingSiteIndex > 0; @@ -458,7 +462,7 @@ public class ReadBackedPhasingWalker extends RodWalker listHetGenotypes, String sample, GenomeLoc phasingLoc, Set onlyKeepReads) { - readsAtHetSites = new HashMap(); + readsAtHetSites = new HashMap(); int index = 0; for (GenotypeAndReadBases grb : listHetGenotypes) { @@ -542,9 +548,9 @@ public class ReadBackedPhasingWalker extends RodWalker nameToReads : readsAtHetSites.entrySet()) { + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { String rdName = nameToReads.getKey(); - Read rd = nameToReads.getValue(); + PhasingRead rd = nameToReads.getValue(); logger.debug(rd + "\t" + rdName); } } @@ -620,16 +626,16 @@ public class ReadBackedPhasingWalker extends RodWalker sitesWithEdges = new TreeSet(); - for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { String rdName = nameToReads.getKey(); - Read rd = nameToReads.getValue(); + PhasingRead rd = nameToReads.getValue(); int[] siteInds = rd.getNonNullIndices(); // Connect each pair of non-null sites in rd: for (int i = 0; i < siteInds.length; i++) { for (int j = i + 1; j < siteInds.length; j++) { GraphEdge e = new GraphEdge(siteInds[i], siteInds[j]); - if ( DEBUG ) logger.debug("Read = " + rdName + " is adding edge: " + e); + if (DEBUG) logger.debug("Read = " + rdName + " is adding edge: " + e); readGraph.addEdge(e); edgeToReads.addRead(e, rdName); @@ -639,7 +645,7 @@ public class ReadBackedPhasingWalker extends RodWalker keepReads = new HashSet(); /* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex): @@ -692,7 +698,8 @@ public class ReadBackedPhasingWalker extends RodWalker removedEdges = readGraph.removeAllIncidentEdges(i); @@ -713,15 +720,15 @@ public class ReadBackedPhasingWalker extends RodWalker v1 -> v2 ---> cur [or vice versa]. @@ -746,13 +753,13 @@ public class ReadBackedPhasingWalker extends RodWalker> readIt = readsAtHetSites.entrySet().iterator(); + Iterator> readIt = readsAtHetSites.entrySet().iterator(); while (readIt.hasNext()) { - Map.Entry nameToReads = readIt.next(); + Map.Entry nameToReads = readIt.next(); String rdName = nameToReads.getKey(); if (!keepReads.contains(rdName)) { readIt.remove(); - if ( DEBUG ) logger.debug("Removing extraneous read: " + rdName); + if (DEBUG) logger.debug("Removing extraneous read: " + rdName); } } @@ -761,8 +768,8 @@ public class ReadBackedPhasingWalker extends RodWalker removeExtraneousSites(List listHetGenotypes) { Set sitesWithReads = new HashSet(); - for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { - Read rd = nameToReads.getValue(); + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + PhasingRead rd = nameToReads.getValue(); for (int i : rd.getNonNullIndices()) sitesWithReads.add(i); } @@ -779,7 +786,8 @@ public class ReadBackedPhasingWalker extends RodWalker trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { - if ( DEBUG) logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); + if (DEBUG) + logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites! @@ -815,7 +824,8 @@ public class ReadBackedPhasingWalker extends RodWalker nameToReads : phaseWindow.readsAtHetSites.entrySet()) { + for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { String rdName = nameToReads.getKey(); - Read rd = nameToReads.getValue(); + PhasingRead rd = nameToReads.getValue(); logger.debug(rd + "\t" + rdName); } } @@ -847,19 +857,20 @@ public class ReadBackedPhasingWalker extends RodWalker nameToReads : phaseWindow.readsAtHetSites.entrySet()) { - Read rd = nameToReads.getValue(); - if ( DEBUG ) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey()); + for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { + PhasingRead rd = nameToReads.getValue(); + if (DEBUG) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey()); for (PhasingTable.PhasingTableEntry pte : sampleHaps) { PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass()); pte.getScore().integrateReadScore(score); - if ( DEBUG ) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); + if (DEBUG) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); } // Check the current best haplotype assignment and compare it to the previous one: MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false); - if ( DEBUG ) logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality); + if (DEBUG) + logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality); if (prevMaxHapAndQual != null) { double changeInPQ = prevMaxHapAndQual.phaseQuality - curMaxHapAndQual.phaseQuality; @@ -867,7 +878,7 @@ public class ReadBackedPhasingWalker extends RodWalker 0 && changeInPQ > FRACTION_OF_MEAN_PQ_CHANGES * (totalAbsPQchange / numPQchangesObserved))) { // a "significant" decrease in PQ - if ( DEBUG ) logger.debug("Inconsistent read found!"); + if (DEBUG) logger.debug("Inconsistent read found!"); numInconsistentIterations++; } } @@ -878,12 +889,14 @@ public class ReadBackedPhasingWalker extends RodWalker MAX_FRACTION_OF_INCONSISTENT_READS) @@ -951,7 +964,7 @@ public class ReadBackedPhasingWalker extends RodWalker { - // list of: - private LinkedList bases; - - public ReadBasesAtPosition() { - this.bases = new LinkedList(); - } - - public void putReadBase(PileupElement pue) { - ReadBase rb = new ReadBase(pue.getRead().getReadName(), pue.getBase(), pue.getMappingQual(), pue.getQual()); - bases.add(rb); - } - - public Iterator iterator() { - return bases.iterator(); - } - - public boolean isEmpty() { - return bases.isEmpty(); - } - } - // // THIS IMPLEMENTATION WILL FAIL WHEN NOT DEALING WITH SNP Alleles (e.g., MNP or INDEL), SINCE THEN THE Allele.getBases() // FUNCTION WILL RETURN VARIABLE-LENGTH Byte ARRAYS. IN THAT CASE, BaseArray/Haplotype/Read WILL NEED TO BE REPLACED WITH @@ -1432,7 +1409,7 @@ public class ReadBackedPhasingWalker extends RodWalker { - protected Byte[] bases; - - public BaseArray(byte[] bases) { - this.bases = new Byte[bases.length]; - for (int i = 0; i < bases.length; i++) - this.bases[i] = bases[i]; - } - - public BaseArray(Byte[] bases) { - this.bases = Arrays.copyOf(bases, bases.length); - } - - public BaseArray(int length) { - this.bases = new Byte[length]; - Arrays.fill(this.bases, null); - } - - public BaseArray(BaseArray other) { - this(other.bases); - } - - public void updateBase(int index, Byte base) { - bases[index] = base; - } - - public Byte getBase(int index) { - return bases[index]; - } - - public int size() { - return bases.length; - } - - public String toString() { - StringBuilder sb = new StringBuilder(bases.length); - for (Byte b : bases) - sb.append(b != null ? (char) b.byteValue() : "_"); - - return sb.toString(); - } - - public int compareTo(BaseArray that) { - int sz = this.bases.length; - if (sz != that.bases.length) - return (sz - that.bases.length); - - for (int i = 0; i < sz; i++) { - Byte thisBase = this.getBase(i); - Byte thatBase = that.getBase(i); - if (thisBase == null || thatBase == null) { - if (thisBase == null && thatBase != null) { - return -1; - } - else if (thisBase != null && thatBase == null) { - return 1; - } - } - else if (!BaseUtils.basesAreEqual(thisBase, thatBase)) { - return thisBase - thatBase; - } - } - return 0; - } -} - -class Haplotype extends BaseArray implements Cloneable { - public Haplotype(byte[] bases) { - super(bases); - } - - private Haplotype(Byte[] bases) { - super(bases); - } - - public Haplotype(Haplotype other) { - super(other); - } - - public void updateBase(int index, Byte base) { - if (base == null) { - throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!"); - } - super.updateBase(index, base); - } - - public Haplotype clone() { - try { - super.clone(); - } catch (CloneNotSupportedException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - } - return new Haplotype(this); - } - - // Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive. - - public Haplotype subHaplotype(int fromIndex, int toIndex) { - return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size()))); - } -} - class HaplotypeClass implements Iterable { private ArrayList haps; private Haplotype rep; @@ -1619,82 +1494,6 @@ class HaplotypeClass implements Iterable { } } -class Read extends BaseArray { - private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly - private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT - private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT - - public Read(int length, int mappingQual) { - super(length); - - this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual)); - - this.baseProbs = new PreciseNonNegativeDouble[length]; - Arrays.fill(this.baseProbs, null); - - this.baseErrorProbs = new PreciseNonNegativeDouble[length]; - Arrays.fill(this.baseErrorProbs, null); - } - - public void updateBaseAndQuality(int index, Byte base, byte baseQual) { - updateBase(index, base); - - double errProb = QualityUtils.qualToErrorProb(baseQual); - - // The base error should be AT LEAST AS HIGH as the mapping error [equivalent to capping the base quality (BQ) by the mapping quality (MQ)]: - errProb = Math.max(errProb, 1.0 - mappingProb.getValue()); - - baseProbs[index] = new PreciseNonNegativeDouble(1.0 - errProb); // The probability that the true base is the base called in the read - baseErrorProbs[index] = new PreciseNonNegativeDouble(errProb / 3.0); // DIVIDE up the error probability EQUALLY over the 3 non-called bases - } - - public PhasingScore matchHaplotypeClassScore(HaplotypeClass hapClass) { - PreciseNonNegativeDouble value = new PreciseNonNegativeDouble(0.0); - for (Haplotype h : hapClass) - value.plusEqual(matchHaplotypeScore(h)); - - return new PhasingScore(value); - } - - private PreciseNonNegativeDouble matchHaplotypeScore(Haplotype hap) { - PreciseNonNegativeDouble score = new PreciseNonNegativeDouble(1.0); - - int sz = this.bases.length; - if (sz != hap.bases.length) - throw new ReviewedStingException("Read and Haplotype should have same length to be compared!"); - - // Technically, this HAS NO EFFECT since it is multiplied in for ALL haplotype pairs, but do so for completeness: - score.timesEqual(mappingProb); - - for (int i = 0; i < sz; i++) { - Byte thisBase = this.getBase(i); - Byte hapBase = hap.getBase(i); - if (thisBase != null && hapBase != null) { - if (BaseUtils.basesAreEqual(thisBase, hapBase)) - score.timesEqual(baseProbs[i]); - else - score.timesEqual(baseErrorProbs[i]); - } - } - - return score; - } - - public int[] getNonNullIndices() { - List nonNull = new LinkedList(); - for (int i = 0; i < bases.length; i++) { - if (getBase(i) != null) - nonNull.add(i); - } - - int[] nonNullArray = new int[nonNull.size()]; - int index = 0; - for (int i : nonNull) - nonNullArray[index++] = i; - return nonNullArray; - } -} - class PhasingStats { private int numReads; private int numVarSites; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java new file mode 100644 index 000000000..ae15c3f12 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.phasing; + +public class ReadBase { + public String readName; + public byte base; + public int mappingQual; + public byte baseQual; + + public ReadBase(String readName, byte base, int mappingQual, byte baseQual) { + this.readName = readName; + this.base = base; + this.mappingQual = mappingQual; + this.baseQual = baseQual; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java new file mode 100644 index 000000000..e5652c56e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.utils.pileup.PileupElement; + +import java.util.Iterator; +import java.util.LinkedList; + +public class ReadBasesAtPosition implements Iterable { + // list of: + private LinkedList bases; + + public ReadBasesAtPosition() { + this.bases = new LinkedList(); + } + + public void putReadBase(PileupElement pue) { + ReadBase rb = new ReadBase(pue.getRead().getReadName(), pue.getBase(), pue.getMappingQual(), pue.getQual()); + bases.add(rb); + } + + public Iterator iterator() { + return bases.iterator(); + } + + public boolean isEmpty() { + return bases.isEmpty(); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBasedPhasingValidationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBasedPhasingValidationWalker.java new file mode 100755 index 000000000..6227a1f8a --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBasedPhasingValidationWalker.java @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.phasing; + +import org.broad.tribble.readers.AsciiLineReader; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.phasing.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.*; +import java.util.*; + +/** + * Walks along all variant ROD loci and verifies the phasing from the reads for user-defined pairs of sites. + */ +@Allows(value = {DataSource.READS, DataSource.REFERENCE}) +@Requires(value = {DataSource.READS, DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) +@By(DataSource.READS) + +@ReadFilters({ZeroMappingQualityReadFilter.class}) +// Filter out all reads with zero mapping quality + +public class ReadBasedPhasingValidationWalker extends RodWalker { + private LinkedList rodNames = null; + + @Argument(fullName = "sitePairsFile", shortName = "sitePairsFile", doc = "File of pairs of variants for which phasing in ROD should be assessed using input reads", required = true) + protected File sitePairsFile = null; + + @Output + protected PrintStream out; + + private Set sitePairs = null; + private String sampleName = null; + + SiteGenotypeAndReads prevSiteAndReads = null; + + private final static int NUM_IN_PAIR = 2; // trivial + + // enable deletions in the pileup + public boolean includeReadsWithDeletionAtLoci() { return true; } + + public void initialize() { + rodNames = new LinkedList(); + rodNames.add("variant"); + + sitePairs = new TreeSet(); + GenomeLocParser locParser = getToolkit().getGenomeLocParser(); + + InputStream sitePairsStream = null; + try { + sitePairsStream = new FileInputStream(sitePairsFile); + } + catch (FileNotFoundException fnfe) { + fnfe.printStackTrace(); + throw new UserException("Problem opening file: " + sitePairsFile); + } + + AsciiLineReader sitePairsReader = new AsciiLineReader(sitePairsStream); + while (true) { + String line = null; + try { + line = sitePairsReader.readLine(); + } + catch (IOException ioe) { + ioe.printStackTrace(); + throw new UserException("Problem reading file: " + sitePairsFile); + } + if (line == null) + break; // reached end of file + + String[] twoSites = line.split("\t"); + if (twoSites.length != 2) + throw new UserException("Must have PAIRS of sites in line " + line + " of " + sitePairsFile); + + SitePair sp = new SitePair(locParser.parseGenomeLoc(twoSites[0]), locParser.parseGenomeLoc(twoSites[1])); + sitePairs.add(sp); + } + } + + public boolean generateExtendedEvents() { + return false; + } + + public Integer reduceInit() { + return 0; + } + + /** + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (tracker == null) + return null; + + boolean relevantSitePair = false; + SitePair sp = null; + if (prevSiteAndReads != null) { + // all vc's below start at ref.getLocus() [due to requireStartHere = true]: + sp = new SitePair(prevSiteAndReads.site, ref.getLocus()); + relevantSitePair = sitePairs.contains(sp); + } + + if (context == null || !context.hasBasePileup()) + return null; + ReadBackedPileup pileup = context.getBasePileup(); + String nextName = null; + + // + // TODO: ASSUMES THAT ALL READS COME FROM A SINGLE SAMPLE: + // TODO: CODE SHOULD BE AS FOLLOWS: + /* + Collection sampNames = pileup.getSampleNames(); + if (sampNames.size() != 1) + throw new UserException("Reads must be for exactly one sample [not multi-sample]"); + nextName = sampNames.iterator().next(); + if (nextName == null) + throw new UserException("Reads must be for exactly one sample"); + + if (sampleName == null) + sampleName = nextName; + else if (!nextName.equals(sampleName)) + throw new UserException("Reads must have a single consistent sample name"); + + pileup = pileup.getPileupForSampleName(sampleName); + */ + // + + ReadBasesAtPosition readBases = new ReadBasesAtPosition(); + for (PileupElement p : pileup) + readBases.putReadBase(p); + + ReadCounter rdCounts = null; + if (relevantSitePair) { // otherwise, processed the reads for their possible use in the future: + PhasingReadList buildReads = new PhasingReadList(NUM_IN_PAIR); + buildReads.updateBases(0, prevSiteAndReads.readBases); + buildReads.updateBases(1, readBases); + + List reads = new LinkedList(); + for (PhasingRead rd : buildReads) { + if (rd.getNonNullIndices().length == NUM_IN_PAIR) // only want reads with BOTH bases called [possibly as deleted ("D")] + reads.add(rd); + } + + // Count the occurence of each "haplotype": + rdCounts = new ReadCounter(); + for (PhasingRead rd : reads) + rdCounts.incrementCount(rd); + } + + + // Now, read the ROD and note the genotypes and their phase to be validated: + Set calledHaplotypes = null; + List allPossibleHaplotypes = null; + + boolean requireStartHere = true; // only see each VariantContext once + boolean takeFirstOnly = true; // take only the first entry from the ROD file + for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { + if (vc.isFiltered() || !vc.isSNP()) + continue; + + if (vc.getNSamples() != 1) + throw new UserException("ROD file must have exactly one sample [not multi-sample]"); + nextName = vc.getSampleNames().iterator().next(); + if (sampleName == null) + sampleName = nextName; + else if (!nextName.equals(sampleName)) + throw new UserException("ROD must have a single consistent sample name"); + + Genotype gt = vc.getGenotype(sampleName); + + if (relevantSitePair) { + Genotype prevGt = prevSiteAndReads.gt; + List prevAlleles = prevGt.getAlleles(); + List curAlleles = gt.getAlleles(); + + calledHaplotypes = new TreeSet(); // implemented Haplotype.compareTo() + if (gt.isPhased()) { + if (gt.getPloidy() != prevGt.getPloidy()) + throw new UserException("Invalid ROD file: cannot be phased AND have different ploidys!"); + + // Consider only the haplotypes called to be phased + Iterator curAllIt = curAlleles.iterator(); + for (Allele prevAll : prevAlleles) { + Allele curAll = curAllIt.next(); + calledHaplotypes.add(successiveAllelesToHaplotype(prevAll, curAll)); + } + } + + // Consider EVERY combination of alleles as haplotypes [IF PHASED, this will give the contingency table in the CORRECT order]: + allPossibleHaplotypes = new LinkedList(); + for (Allele prevAll : prevAlleles) { + for (Allele curAll : curAlleles) { + allPossibleHaplotypes.add(successiveAllelesToHaplotype(prevAll, curAll)); + } + } + } + + prevSiteAndReads = new SiteGenotypeAndReads(ref.getLocus(), gt, readBases); + } + + int processedPairs = 0; + if (relevantSitePair) { + Map haplotypeCounts = new TreeMap(); // implemented Haplotype.compareTo() + + processedPairs = 1; + int totalCount = rdCounts.totalCount(); + System.out.println("\nPair: " + sp + " [# reads = " + totalCount + "]"); + + int matchCount = 0; + for (Map.Entry rdEntry : rdCounts.entrySet()) { + PhasingRead read = rdEntry.getKey(); + int count = rdEntry.getValue(); + + Haplotype readsHaplotype = new Haplotype(read); + haplotypeCounts.put(readsHaplotype, count); + + boolean readMatchesCalledHaplotype = calledHaplotypes != null && calledHaplotypes.contains(readsHaplotype); + if (readMatchesCalledHaplotype) + matchCount += count; + + System.out.println("read" + ": " + read + (readMatchesCalledHaplotype ? "*" : "") + "\tcount: " + count); + } + + double percentMatchingReads = 100 * (matchCount / (double) totalCount); + System.out.println("% MATCHING reads: " + percentMatchingReads + " [of " + totalCount + " TOTAL reads]"); + + out.print(sp); + for (Haplotype hap : allPossibleHaplotypes) { + Integer count = haplotypeCounts.get(hap); + if (count == null) // haplotype may not have been observed in ANY reads + count = 0; + + out.print("\t" + count); + } + out.println(); + } + + return processedPairs; + } + + private Haplotype successiveAllelesToHaplotype(Allele prevAll, Allele curAll) { + byte prevBase = SNPallelePair.getSingleBase(prevAll); + byte curBase = SNPallelePair.getSingleBase(curAll); + + byte[] hapBases = new byte[NUM_IN_PAIR]; + hapBases[0] = prevBase; + hapBases[1] = curBase; + return new Haplotype(hapBases); + } + + public Integer reduce(Integer addIn, Integer runningCount) { + if (addIn == null) + addIn = 0; + + return runningCount + addIn; + } + + /** + * @param result the number of reads and VariantContexts seen. + */ + public void onTraversalDone(Integer result) { + System.out.println("Validated " + result + " pairs of sites."); + } +} + +class SitePair implements Comparable { + public GenomeLoc site1; + public GenomeLoc site2; + + public SitePair(GenomeLoc site1, GenomeLoc site2) { + if (site1.size() > 1 || site2.size() > 1) + throw new UserException("Must give pairs of SINGLE-LOCUS record start sites"); + + this.site1 = site1; + this.site2 = site2; + } + + public String toString() { + return site1.toString() + "\t" + site2.toString(); + } + + public int compareTo(SitePair other) { + int comp1 = site1.compareTo(other.site1); + if (comp1 != 0) + return comp1; + + return site2.compareTo(other.site2); + } +} + +class SiteGenotypeAndReads { + public GenomeLoc site; + public Genotype gt; + public ReadBasesAtPosition readBases; + + public SiteGenotypeAndReads(GenomeLoc site, Genotype gt, ReadBasesAtPosition readBases) { + this.site = site; + this.gt = gt; + this.readBases = readBases; + } +} + +class PhasingReadList implements Iterable { + private Map readsAtSites = null; + private int numSites; + + public PhasingReadList(int numSites) { + this.readsAtSites = new HashMap(); + this.numSites = numSites; + } + + public void updateBases(int index, ReadBasesAtPosition readBases) { + if (readBases == null) + return; + + for (ReadBase rb : readBases) { + String readName = rb.readName; + + PhasingRead rd = readsAtSites.get(readName); + if (rd == null) { + rd = new PhasingRead(numSites, rb.mappingQual); + readsAtSites.put(readName, rd); + } + + // Arbitrarily updates to the last base observed for this sample and read (rb.base): + rd.updateBaseAndQuality(index, rb.base, rb.baseQual); + } + } + + public Iterator iterator() { + return readsAtSites.values().iterator(); + } + + public int size() { + return readsAtSites.size(); + } +} + +class ReadCounter { + private Map counts; + private int totalCount; + + public ReadCounter() { + this.counts = new TreeMap(); // implemented PhasingRead.compareTo() + } + + public void incrementCount(PhasingRead rd) { + Integer cnt = counts.get(rd); + if (cnt == null) + cnt = 0; + + counts.put(rd, cnt + 1); + totalCount++; + } + + public Set> entrySet() { + return counts.entrySet(); + } + + public int totalCount() { + return totalCount; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 8d679744b..b08af58aa 100644 --- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -93,6 +93,10 @@ public class BaseUtils { return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2); } + static public boolean extendedBasesAreEqual(byte base1, byte base2) { + return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); + } + /** * Converts a IUPAC nucleotide code to a pair of bases