First implementation of the Error Model.

Added stratification by lane to ReadBackedPileup.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6046 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
droazen 2011-06-22 22:54:33 +00:00
parent 27b1418b84
commit 2e3d6754cd
6 changed files with 327 additions and 6 deletions

View File

@ -1,5 +1,8 @@
package org.broadinstitute.sting.playground.gatk.walkers.replication_validation;
import com.google.common.collect.ArrayListMultimap;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -8,8 +11,14 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.MathUtils;
import java.io.PrintStream;
import org.apache.commons.math.util.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
import java.util.regex.Pattern;
/**
* Implementation of the replication and validation framework with reference based error model
@ -24,20 +33,207 @@ public class ReplicationValidationWalker extends LocusWalker<Integer, Long> impl
@Argument(shortName="refSample", fullName="reference_sample_name", doc="reference sample name", required=true)
String referenceSampleName = "NA12878";
@Argument(shortName="nsamples", fullName="number_of_samples", doc="Number of samples in the dataset (not counting the reference sample).", required=true)
int nSamples = -1;
@Argument(shortName="nchr", fullName="number_of_chromosomes", doc="Number of chromosomes per sample (in case you're not dealing with diploids). Default: 2.", required=false)
int nChromosomes = 2;
@Argument(shortName="maxac", fullName="max_allele_count", doc="Max number of alleles expected in a site. Default: 2 * number of samples. Smaller numbers process faster.", required=false)
int overrideMaxAlleleCount = -1;
@Argument(shortName="maxqs", fullName="max_quality_score", doc="Max quality score to consider. Default: Q40. Smaller numbers process faster.", required=false)
int maxQualityScore= 40;
@Argument(shortName="prior", fullName="site_quality_prior", doc="Phred-Scaled prior quality of the site. Default: Q20.", required=false)
byte defaultPrior= 20;
@Output(doc="Write output to this file instead of STDOUT")
PrintStream out;
public void initialize() {
return;
int maxAlleleCount;
List<LaneData> lanes;
private class LaneData {
String id;
List <String> sampleNames;
String referenceSample;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<String> getSampleNames() {
return sampleNames;
}
public void setSampleNames(List<String> sampleNames) {
this.sampleNames = sampleNames;
}
public String getReferenceSample() {
return referenceSample;
}
public void setReferenceSample(String referenceSample) {
this.referenceSample = referenceSample;
}
}
/**
* Calculates he probability of the data (reference sample reads) given the phred scaled site quality score.
* @param data - array of bases from the reference sample
* @param refBase - base from the reference sequence for this site
* @param phredScaledPrior - phred scaled expected site quality (prior)
* @return an array of log10 probabilities of site qualities ranging from Q1-Q40.
*/
private double[] buildErrorModel (byte[] data, byte refBase, byte phredScaledPrior) {
double [] model = new double[maxQualityScore+1];
int coverage = data.length;
int mismatches = getNumberOfMismatches(data, refBase);
int matches = coverage - mismatches;
for (byte q=0; q<=maxQualityScore; q++) {
double probMismatch = MathUtils.phredScaleToProbability(q);
model[q] = MathUtils.phredScaleToLog10Probability(phredScaledPrior) +
org.apache.commons.math.util.MathUtils.binomialCoefficientLog(coverage, mismatches) +
mismatches * Math.log10(probMismatch) +
matches * Math.log10(1-probMismatch);
}
return model;
}
// buildErrorModel helper functions
private int getNumberOfMismatches(byte[] data, byte refBase) {
int mismatches = 0;
for (byte seqBase : data) {
if (seqBase != refBase)
mismatches++;
}
return mismatches;
}
// private String getLaneId (SAMReadGroupRecord rg) {
// String [] idParts = rg.getPlatformUnit().split("\\.");
// return idParts[0] + idParts[1]; // 0 is run_id, 1 is lane_number.
// }
//
// private List<LaneData> createLanesList () {
// List<SAMReadGroupRecord> rgs = getToolkit().getSAMFileHeader().getReadGroups();
// for(SAMReadGroupRecord rg : rgs) {
// String laneId = getLaneId(rg);
// }
//
// return new LinkedList<LaneData>();
// }
//
// private int getNumberOfSamples() {
// int nSamples = 0;
// List<Set<String>> readers = getToolkit().getSamplesByReaders();
// for(Set<String> reader : readers) {
// nSamples += reader.size();
// if (reader.contains(referenceSampleName))
// nSamples--;
// }
// return nSamples;
// }
public void initialize() {
if (overrideMaxAlleleCount > 0)
maxAlleleCount = overrideMaxAlleleCount;
else
maxAlleleCount = nSamples*nChromosomes;
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
// todo -- Locate the reference samples for each lane
// todo -- Locate each pool and assign a reference sample to it
ReadBackedPileup contextPileup = context.getBasePileup();
Set<String> lanesInLocus = getLaneIDs(contextPileup.getReadGroups());
for (String laneID : lanesInLocus) {
// make a pileup for this lane
ReadBackedPileup lanePileup = contextPileup.getPileupForLane(laneID);
// make a pileup without the reference sample
HashSet<String> samples = new HashSet<String>(lanePileup.getSampleNames());
samples.remove(referenceSampleName);
ReadBackedPileup poolPileup = lanePileup.getPileupForSampleNames(samples);
//System.out.println(ref.getLocus().getContig() + ":" + ref.getLocus().getStart());
// create a reference pileup to build error model
ReadBackedPileup referenceSamplePileup = lanePileup.getPileupForSampleName(referenceSampleName);
// We can only analyze loci where we have reference sample coverage. And it only makes sense to build
// an error model if the site has reads in the pileup (other than the reference sample)
if (referenceSamplePileup != null && !referenceSamplePileup.isEmpty() && !poolPileup.isEmpty()) {
// Build error model
double [] errorModel = buildErrorModel(poolPileup.getBases(), referenceSamplePileup.getBases()[0], defaultPrior);
// Debug error model
System.out.println(laneID + ":");
for (double v : errorModel)
System.out.print(v + ", ");
System.out.println();
// todo: call each pool for this site
// todo: merge pools
// todo: decide whether or not it's a variant
}
}
return 1;
}
public Long reduceInit() { return 0l; }
/**
* GATK Engine creates readgroups of the form XXX.Y.Z
* XXX.Y is the unique lane identifier.
* Z is the id of the sample to make the read group id unique
* This function returns a map that groups together all samples that were run in the same lane:
* XXX.Y -> {Z1, Z2, Z3, ... Zn}
* @param readGroups A collection of read group strings (obtained from the alignment context pileup)
* @return A map of lanes and samples, as described above
*/
// private HashMap<String, ArrayList<String>> decodeReadIDs(Collection<String> readGroups) {
// HashMap<String, ArrayList<String>> result = new HashMap<String, ArrayList<String>>();
// for (String rgid : readGroups) {
// String [] parsedId = rgid.split("\\.");
// String lane = parsedId[0] + "." + parsedId[1];
// String id = parsedId[2];
// ArrayList<String> idList = (result.get(lane) == null) ? (new ArrayList<String>()) : result.get(lane);
// idList.add(id);
// result.put(lane, idList);
// }
// return result;
// }
/**
* GATK Engine creates readgroups of the form XXX.Y.Z
* XXX.Y is the unique lane identifier.
* Z is the id of the sample to make the read group id unique
* This function returns a list of unique lane identifiers.
* @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup)
* @return a collection of lane ids.
*/
private Set<String> getLaneIDs(Collection<String> readGroups) {
HashSet<String> result = new HashSet<String>();
for (String rgid : readGroups) {
String [] parsedId = rgid.split("\\.");
result.add(parsedId[0] + "." + parsedId[1]);
}
return result;
}
public Long reduceInit() {
return 0l;
}
public Long reduce(Integer value, Long sum) {
return value + sum;
@ -55,3 +251,4 @@ public class ReplicationValidationWalker extends LocusWalker<Integer, Long> impl
out.println(c);
}
}

View File

@ -608,6 +608,14 @@ public class MathUtils {
return average(vals, vals.size());
}
public static byte average(byte[] vals) {
int sum = 0;
for (byte v : vals) {
sum += v;
}
return (byte) Math.floor(sum/vals.length);
}
public static double averageDouble(List<Double> vals) {
return averageDouble(vals, vals.size());
}
@ -1048,4 +1056,19 @@ public class MathUtils {
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
}
static public double phredScaleToProbability (byte q) {
return Math.pow(10,(-q)/10.0);
}
static public double phredScaleToLog10Probability (byte q) {
return ((-q)/10.0);
}
static public byte probabilityToPhredScale (double p) {
return (byte) ((-10) * Math.log10(p));
}
static public double log10ProbabilityToPhredScale (double log10p) {
return (-10) * log10p;
}
}

View File

@ -520,6 +520,39 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
}
}
@Override
public RBP getPileupForLane(String laneID) {
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>)pileupElementTracker;
PerSamplePileupElementTracker<PE> filteredTracker = new PerSamplePileupElementTracker<PE>();
for(Sample sample: tracker.getSamples()) {
PileupElementTracker<PE> perSampleElements = tracker.getElements(sample);
AbstractReadBackedPileup<RBP,PE> pileup = createNewPileup(loc,perSampleElements).getPileupForLane(laneID);
if(pileup != null)
filteredTracker.addElements(sample,pileup.pileupElementTracker);
}
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
}
else {
UnifiedPileupElementTracker<PE> filteredTracker = new UnifiedPileupElementTracker<PE>();
for(PE p: pileupElementTracker) {
SAMRecord read = p.getRead();
if(laneID != null) {
if(read.getReadGroup() != null &&
(read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different
(read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same
filteredTracker.add(p);
}
else {
if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null)
filteredTracker.add(p);
}
}
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
}
}
public Collection<String> getSampleNames() {
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>)pileupElementTracker;
@ -608,6 +641,32 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
}
}
@Override
public RBP getPileupForSampleNames(Collection<String> sampleNames) {
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>)pileupElementTracker;
PileupElementTracker<PE> filteredElements = tracker.getElements(sampleNames);
return filteredElements != null ? (RBP)createNewPileup(loc,filteredElements) : null;
}
else {
HashSet<String> hashSampleNames = new HashSet<String>(sampleNames); // to speed up the "contains" access in the for loop
UnifiedPileupElementTracker<PE> filteredTracker = new UnifiedPileupElementTracker<PE>();
for(PE p: pileupElementTracker) {
SAMRecord read = p.getRead();
if(sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else.
if(read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample()))
filteredTracker.add(p);
}
else {
if(read.getReadGroup() == null || read.getReadGroup().getSample() == null)
filteredTracker.add(p);
}
}
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
}
}
@Override
public RBP getPileupForSampleName(String sampleName) {
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
@ -716,6 +775,15 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
return size;
}
/**
* @return true if there are 0 elements in the pileup, false otherwise
*/
@Override
public boolean isEmpty() {
return size==0;
}
/**
* @return the location of this pileup
*/

View File

@ -41,7 +41,7 @@ class MergingPileupElementIterator<PE extends PileupElement> implements Iterator
private final PriorityQueue<PeekableIterator<PE>> perSampleIterators;
public MergingPileupElementIterator(PerSamplePileupElementTracker<PE> tracker) {
perSampleIterators = new PriorityQueue<PeekableIterator<PE>>(tracker.getSamples().size(),new PileupElementIteratorComparator());
perSampleIterators = new PriorityQueue<PeekableIterator<PE>>(Math.max(1,tracker.getSamples().size()),new PileupElementIteratorComparator());
for(Sample sample: tracker.getSamples()) {
PileupElementTracker<PE> trackerPerSample = tracker.getElements(sample);
if(trackerPerSample.size() != 0)

View File

@ -94,6 +94,15 @@ class PerSamplePileupElementTracker<PE extends PileupElement> extends PileupElem
return pileup.get(sampleNames.get(sampleName));
}
public PileupElementTracker<PE> getElements(final Collection<String> selectSampleNames) {
PerSamplePileupElementTracker<PE> result = new PerSamplePileupElementTracker<PE>();
for (String sample : selectSampleNames) {
Sample sampleObject = sampleNames.get(sample);
result.addElements(sampleObject, pileup.get(sampleObject));
}
return result;
}
public void addElements(final Sample sample, PileupElementTracker<PE> elements) {
pileup.put(sample,elements);
sampleNames.put(sample.getId(), sample);

View File

@ -130,6 +130,15 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
*/
public ReadBackedPileup getPileupForReadGroup(String readGroupId);
/**
* Gets all reads in a given lane id. (Lane ID is the read group
* id stripped of the last .XX sample identifier added by the GATK).
* @param laneID The read group ID without the sample identifier added by the GATK.
* @return A pileup containing the reads from all samples in the given lane.
*/
public ReadBackedPileup getPileupForLane(String laneID);
/**
* Gets a collection of all the samples stored in this pileup.
* @return Collection of samples in this pileup.
@ -142,6 +151,15 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
*/
public Collection<String> getSampleNames();
/**
* Gets the particular subset of this pileup for all the given sample names.
* @param sampleNames Name of the sample to use.
* @return A subset of this pileup containing only reads with the given sample.
*/
public ReadBackedPileup getPileupForSampleNames(Collection<String> sampleNames);
/**
* Gets the particular subset of this pileup with the given sample name.
* @param sampleName Name of the sample to use.
@ -170,6 +188,11 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
*/
public int size();
/**
* @return true if there are 0 elements in the pileup, false otherwise
*/
public boolean isEmpty();
/**
* @return the location of this pileup
*/
@ -214,4 +237,5 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
* @return
*/
public byte[] getMappingQuals();
}