diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java index 55fc362f2..17763b059 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java @@ -1,5 +1,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.replication_validation; +import com.google.common.collect.ArrayListMultimap; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -8,8 +11,14 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.MathUtils; import java.io.PrintStream; +import org.apache.commons.math.util.*; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.*; +import java.util.regex.Pattern; /** * Implementation of the replication and validation framework with reference based error model @@ -24,20 +33,207 @@ public class ReplicationValidationWalker extends LocusWalker impl @Argument(shortName="refSample", fullName="reference_sample_name", doc="reference sample name", required=true) String referenceSampleName = "NA12878"; + @Argument(shortName="nsamples", fullName="number_of_samples", doc="Number of samples in the dataset (not counting the reference sample).", required=true) + int nSamples = -1; + + @Argument(shortName="nchr", fullName="number_of_chromosomes", doc="Number of chromosomes per sample (in case you're not dealing with diploids). Default: 2.", required=false) + int nChromosomes = 2; + + @Argument(shortName="maxac", fullName="max_allele_count", doc="Max number of alleles expected in a site. Default: 2 * number of samples. Smaller numbers process faster.", required=false) + int overrideMaxAlleleCount = -1; + + @Argument(shortName="maxqs", fullName="max_quality_score", doc="Max quality score to consider. Default: Q40. Smaller numbers process faster.", required=false) + int maxQualityScore= 40; + + @Argument(shortName="prior", fullName="site_quality_prior", doc="Phred-Scaled prior quality of the site. Default: Q20.", required=false) + byte defaultPrior= 20; + @Output(doc="Write output to this file instead of STDOUT") PrintStream out; - public void initialize() { - return; + int maxAlleleCount; + List lanes; + + private class LaneData { + String id; + List sampleNames; + String referenceSample; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getSampleNames() { + return sampleNames; + } + + public void setSampleNames(List sampleNames) { + this.sampleNames = sampleNames; + } + + public String getReferenceSample() { + return referenceSample; + } + + public void setReferenceSample(String referenceSample) { + this.referenceSample = referenceSample; + } } + /** + * Calculates he probability of the data (reference sample reads) given the phred scaled site quality score. + * @param data - array of bases from the reference sample + * @param refBase - base from the reference sequence for this site + * @param phredScaledPrior - phred scaled expected site quality (prior) + * @return an array of log10 probabilities of site qualities ranging from Q1-Q40. + */ + private double[] buildErrorModel (byte[] data, byte refBase, byte phredScaledPrior) { + double [] model = new double[maxQualityScore+1]; + int coverage = data.length; + int mismatches = getNumberOfMismatches(data, refBase); + int matches = coverage - mismatches; + + for (byte q=0; q<=maxQualityScore; q++) { + double probMismatch = MathUtils.phredScaleToProbability(q); + model[q] = MathUtils.phredScaleToLog10Probability(phredScaledPrior) + + org.apache.commons.math.util.MathUtils.binomialCoefficientLog(coverage, mismatches) + + mismatches * Math.log10(probMismatch) + + matches * Math.log10(1-probMismatch); + } + return model; + } + + // buildErrorModel helper functions + private int getNumberOfMismatches(byte[] data, byte refBase) { + int mismatches = 0; + for (byte seqBase : data) { + if (seqBase != refBase) + mismatches++; + } + return mismatches; + } + +// private String getLaneId (SAMReadGroupRecord rg) { +// String [] idParts = rg.getPlatformUnit().split("\\."); +// return idParts[0] + idParts[1]; // 0 is run_id, 1 is lane_number. +// } +// +// private List createLanesList () { +// List rgs = getToolkit().getSAMFileHeader().getReadGroups(); +// for(SAMReadGroupRecord rg : rgs) { +// String laneId = getLaneId(rg); +// } +// +// return new LinkedList(); +// } +// +// private int getNumberOfSamples() { +// int nSamples = 0; +// List> readers = getToolkit().getSamplesByReaders(); +// for(Set reader : readers) { +// nSamples += reader.size(); +// if (reader.contains(referenceSampleName)) +// nSamples--; +// } +// return nSamples; +// } + + + public void initialize() { + if (overrideMaxAlleleCount > 0) + maxAlleleCount = overrideMaxAlleleCount; + else + maxAlleleCount = nSamples*nChromosomes; + } + + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - // todo -- Locate the reference samples for each lane - // todo -- Locate each pool and assign a reference sample to it + ReadBackedPileup contextPileup = context.getBasePileup(); + Set lanesInLocus = getLaneIDs(contextPileup.getReadGroups()); + for (String laneID : lanesInLocus) { + // make a pileup for this lane + ReadBackedPileup lanePileup = contextPileup.getPileupForLane(laneID); + + // make a pileup without the reference sample + HashSet samples = new HashSet(lanePileup.getSampleNames()); + samples.remove(referenceSampleName); + + ReadBackedPileup poolPileup = lanePileup.getPileupForSampleNames(samples); + + //System.out.println(ref.getLocus().getContig() + ":" + ref.getLocus().getStart()); + + // create a reference pileup to build error model + ReadBackedPileup referenceSamplePileup = lanePileup.getPileupForSampleName(referenceSampleName); + + // We can only analyze loci where we have reference sample coverage. And it only makes sense to build + // an error model if the site has reads in the pileup (other than the reference sample) + if (referenceSamplePileup != null && !referenceSamplePileup.isEmpty() && !poolPileup.isEmpty()) { + + // Build error model + double [] errorModel = buildErrorModel(poolPileup.getBases(), referenceSamplePileup.getBases()[0], defaultPrior); + + // Debug error model + System.out.println(laneID + ":"); + for (double v : errorModel) + System.out.print(v + ", "); + System.out.println(); + + // todo: call each pool for this site + // todo: merge pools + // todo: decide whether or not it's a variant + } + } return 1; } - public Long reduceInit() { return 0l; } + /** + * GATK Engine creates readgroups of the form XXX.Y.Z + * XXX.Y is the unique lane identifier. + * Z is the id of the sample to make the read group id unique + * This function returns a map that groups together all samples that were run in the same lane: + * XXX.Y -> {Z1, Z2, Z3, ... Zn} + * @param readGroups A collection of read group strings (obtained from the alignment context pileup) + * @return A map of lanes and samples, as described above + */ +// private HashMap> decodeReadIDs(Collection readGroups) { +// HashMap> result = new HashMap>(); +// for (String rgid : readGroups) { +// String [] parsedId = rgid.split("\\."); +// String lane = parsedId[0] + "." + parsedId[1]; +// String id = parsedId[2]; +// ArrayList idList = (result.get(lane) == null) ? (new ArrayList()) : result.get(lane); +// idList.add(id); +// result.put(lane, idList); +// } +// return result; +// } + + /** + * GATK Engine creates readgroups of the form XXX.Y.Z + * XXX.Y is the unique lane identifier. + * Z is the id of the sample to make the read group id unique + * This function returns a list of unique lane identifiers. + * @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup) + * @return a collection of lane ids. + */ + private Set getLaneIDs(Collection readGroups) { + HashSet result = new HashSet(); + for (String rgid : readGroups) { + String [] parsedId = rgid.split("\\."); + result.add(parsedId[0] + "." + parsedId[1]); + } + return result; + } + + + public Long reduceInit() { + return 0l; + } public Long reduce(Integer value, Long sum) { return value + sum; @@ -55,3 +251,4 @@ public class ReplicationValidationWalker extends LocusWalker impl out.println(c); } } + diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index d4501537e..88d186ad2 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -608,6 +608,14 @@ public class MathUtils { return average(vals, vals.size()); } + public static byte average(byte[] vals) { + int sum = 0; + for (byte v : vals) { + sum += v; + } + return (byte) Math.floor(sum/vals.length); + } + public static double averageDouble(List vals) { return averageDouble(vals, vals.size()); } @@ -1048,4 +1056,19 @@ public class MathUtils { // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); } + static public double phredScaleToProbability (byte q) { + return Math.pow(10,(-q)/10.0); + } + + static public double phredScaleToLog10Probability (byte q) { + return ((-q)/10.0); + } + + static public byte probabilityToPhredScale (double p) { + return (byte) ((-10) * Math.log10(p)); + } + + static public double log10ProbabilityToPhredScale (double log10p) { + return (-10) * log10p; + } } diff --git a/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 98f811229..543302446 100644 --- a/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -520,6 +520,39 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for(Sample sample: tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForLane(laneID); + if(pileup != null) + filteredTracker.addElements(sample,pileup.pileupElementTracker); + } + return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + } + else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for(PE p: pileupElementTracker) { + SAMRecord read = p.getRead(); + if(laneID != null) { + if(read.getReadGroup() != null && + (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different + (read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same + filteredTracker.add(p); + } + else { + if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + } + } + public Collection getSampleNames() { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; @@ -608,6 +641,32 @@ public abstract class AbstractReadBackedPileup sampleNames) { + if(pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + PileupElementTracker filteredElements = tracker.getElements(sampleNames); + return filteredElements != null ? (RBP)createNewPileup(loc,filteredElements) : null; + } + else { + HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for(PE p: pileupElementTracker) { + SAMRecord read = p.getRead(); + if(sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. + if(read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) + filteredTracker.add(p); + } + else { + if(read.getReadGroup() == null || read.getReadGroup().getSample() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + } + } + + @Override public RBP getPileupForSampleName(String sampleName) { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { @@ -716,6 +775,15 @@ public abstract class AbstractReadBackedPileup implements Iterator private final PriorityQueue> perSampleIterators; public MergingPileupElementIterator(PerSamplePileupElementTracker tracker) { - perSampleIterators = new PriorityQueue>(tracker.getSamples().size(),new PileupElementIteratorComparator()); + perSampleIterators = new PriorityQueue>(Math.max(1,tracker.getSamples().size()),new PileupElementIteratorComparator()); for(Sample sample: tracker.getSamples()) { PileupElementTracker trackerPerSample = tracker.getElements(sample); if(trackerPerSample.size() != 0) diff --git a/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java b/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java index 2f5d2672b..29e431695 100644 --- a/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java @@ -94,6 +94,15 @@ class PerSamplePileupElementTracker extends PileupElem return pileup.get(sampleNames.get(sampleName)); } + public PileupElementTracker getElements(final Collection selectSampleNames) { + PerSamplePileupElementTracker result = new PerSamplePileupElementTracker(); + for (String sample : selectSampleNames) { + Sample sampleObject = sampleNames.get(sample); + result.addElements(sampleObject, pileup.get(sampleObject)); + } + return result; + } + public void addElements(final Sample sample, PileupElementTracker elements) { pileup.put(sample,elements); sampleNames.put(sample.getId(), sample); diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 2beb5b0d1..c52cc0b52 100644 --- a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -130,6 +130,15 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForReadGroup(String readGroupId); + /** + * Gets all reads in a given lane id. (Lane ID is the read group + * id stripped of the last .XX sample identifier added by the GATK). + * @param laneID The read group ID without the sample identifier added by the GATK. + * @return A pileup containing the reads from all samples in the given lane. + */ + public ReadBackedPileup getPileupForLane(String laneID); + + /** * Gets a collection of all the samples stored in this pileup. * @return Collection of samples in this pileup. @@ -142,6 +151,15 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public Collection getSampleNames(); + + /** + * Gets the particular subset of this pileup for all the given sample names. + * @param sampleNames Name of the sample to use. + * @return A subset of this pileup containing only reads with the given sample. + */ + public ReadBackedPileup getPileupForSampleNames(Collection sampleNames); + + /** * Gets the particular subset of this pileup with the given sample name. * @param sampleName Name of the sample to use. @@ -170,6 +188,11 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public int size(); + /** + * @return true if there are 0 elements in the pileup, false otherwise + */ + public boolean isEmpty(); + /** * @return the location of this pileup */ @@ -214,4 +237,5 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * @return */ public byte[] getMappingQuals(); + }