First implementation of the Error Model.
Added stratification by lane to ReadBackedPileup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6046 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
27b1418b84
commit
2e3d6754cd
|
|
@ -1,5 +1,8 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.replication_validation;
|
||||
|
||||
import com.google.common.collect.ArrayListMultimap;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -8,8 +11,14 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import org.apache.commons.math.util.*;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Implementation of the replication and validation framework with reference based error model
|
||||
|
|
@ -24,20 +33,207 @@ public class ReplicationValidationWalker extends LocusWalker<Integer, Long> impl
|
|||
@Argument(shortName="refSample", fullName="reference_sample_name", doc="reference sample name", required=true)
|
||||
String referenceSampleName = "NA12878";
|
||||
|
||||
@Argument(shortName="nsamples", fullName="number_of_samples", doc="Number of samples in the dataset (not counting the reference sample).", required=true)
|
||||
int nSamples = -1;
|
||||
|
||||
@Argument(shortName="nchr", fullName="number_of_chromosomes", doc="Number of chromosomes per sample (in case you're not dealing with diploids). Default: 2.", required=false)
|
||||
int nChromosomes = 2;
|
||||
|
||||
@Argument(shortName="maxac", fullName="max_allele_count", doc="Max number of alleles expected in a site. Default: 2 * number of samples. Smaller numbers process faster.", required=false)
|
||||
int overrideMaxAlleleCount = -1;
|
||||
|
||||
@Argument(shortName="maxqs", fullName="max_quality_score", doc="Max quality score to consider. Default: Q40. Smaller numbers process faster.", required=false)
|
||||
int maxQualityScore= 40;
|
||||
|
||||
@Argument(shortName="prior", fullName="site_quality_prior", doc="Phred-Scaled prior quality of the site. Default: Q20.", required=false)
|
||||
byte defaultPrior= 20;
|
||||
|
||||
@Output(doc="Write output to this file instead of STDOUT")
|
||||
PrintStream out;
|
||||
|
||||
public void initialize() {
|
||||
return;
|
||||
int maxAlleleCount;
|
||||
List<LaneData> lanes;
|
||||
|
||||
private class LaneData {
|
||||
String id;
|
||||
List <String> sampleNames;
|
||||
String referenceSample;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public List<String> getSampleNames() {
|
||||
return sampleNames;
|
||||
}
|
||||
|
||||
public void setSampleNames(List<String> sampleNames) {
|
||||
this.sampleNames = sampleNames;
|
||||
}
|
||||
|
||||
public String getReferenceSample() {
|
||||
return referenceSample;
|
||||
}
|
||||
|
||||
public void setReferenceSample(String referenceSample) {
|
||||
this.referenceSample = referenceSample;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates he probability of the data (reference sample reads) given the phred scaled site quality score.
|
||||
* @param data - array of bases from the reference sample
|
||||
* @param refBase - base from the reference sequence for this site
|
||||
* @param phredScaledPrior - phred scaled expected site quality (prior)
|
||||
* @return an array of log10 probabilities of site qualities ranging from Q1-Q40.
|
||||
*/
|
||||
private double[] buildErrorModel (byte[] data, byte refBase, byte phredScaledPrior) {
|
||||
double [] model = new double[maxQualityScore+1];
|
||||
int coverage = data.length;
|
||||
int mismatches = getNumberOfMismatches(data, refBase);
|
||||
int matches = coverage - mismatches;
|
||||
|
||||
for (byte q=0; q<=maxQualityScore; q++) {
|
||||
double probMismatch = MathUtils.phredScaleToProbability(q);
|
||||
model[q] = MathUtils.phredScaleToLog10Probability(phredScaledPrior) +
|
||||
org.apache.commons.math.util.MathUtils.binomialCoefficientLog(coverage, mismatches) +
|
||||
mismatches * Math.log10(probMismatch) +
|
||||
matches * Math.log10(1-probMismatch);
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
// buildErrorModel helper functions
|
||||
private int getNumberOfMismatches(byte[] data, byte refBase) {
|
||||
int mismatches = 0;
|
||||
for (byte seqBase : data) {
|
||||
if (seqBase != refBase)
|
||||
mismatches++;
|
||||
}
|
||||
return mismatches;
|
||||
}
|
||||
|
||||
// private String getLaneId (SAMReadGroupRecord rg) {
|
||||
// String [] idParts = rg.getPlatformUnit().split("\\.");
|
||||
// return idParts[0] + idParts[1]; // 0 is run_id, 1 is lane_number.
|
||||
// }
|
||||
//
|
||||
// private List<LaneData> createLanesList () {
|
||||
// List<SAMReadGroupRecord> rgs = getToolkit().getSAMFileHeader().getReadGroups();
|
||||
// for(SAMReadGroupRecord rg : rgs) {
|
||||
// String laneId = getLaneId(rg);
|
||||
// }
|
||||
//
|
||||
// return new LinkedList<LaneData>();
|
||||
// }
|
||||
//
|
||||
// private int getNumberOfSamples() {
|
||||
// int nSamples = 0;
|
||||
// List<Set<String>> readers = getToolkit().getSamplesByReaders();
|
||||
// for(Set<String> reader : readers) {
|
||||
// nSamples += reader.size();
|
||||
// if (reader.contains(referenceSampleName))
|
||||
// nSamples--;
|
||||
// }
|
||||
// return nSamples;
|
||||
// }
|
||||
|
||||
|
||||
public void initialize() {
|
||||
if (overrideMaxAlleleCount > 0)
|
||||
maxAlleleCount = overrideMaxAlleleCount;
|
||||
else
|
||||
maxAlleleCount = nSamples*nChromosomes;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
// todo -- Locate the reference samples for each lane
|
||||
// todo -- Locate each pool and assign a reference sample to it
|
||||
ReadBackedPileup contextPileup = context.getBasePileup();
|
||||
Set<String> lanesInLocus = getLaneIDs(contextPileup.getReadGroups());
|
||||
for (String laneID : lanesInLocus) {
|
||||
// make a pileup for this lane
|
||||
ReadBackedPileup lanePileup = contextPileup.getPileupForLane(laneID);
|
||||
|
||||
// make a pileup without the reference sample
|
||||
HashSet<String> samples = new HashSet<String>(lanePileup.getSampleNames());
|
||||
samples.remove(referenceSampleName);
|
||||
|
||||
ReadBackedPileup poolPileup = lanePileup.getPileupForSampleNames(samples);
|
||||
|
||||
//System.out.println(ref.getLocus().getContig() + ":" + ref.getLocus().getStart());
|
||||
|
||||
// create a reference pileup to build error model
|
||||
ReadBackedPileup referenceSamplePileup = lanePileup.getPileupForSampleName(referenceSampleName);
|
||||
|
||||
// We can only analyze loci where we have reference sample coverage. And it only makes sense to build
|
||||
// an error model if the site has reads in the pileup (other than the reference sample)
|
||||
if (referenceSamplePileup != null && !referenceSamplePileup.isEmpty() && !poolPileup.isEmpty()) {
|
||||
|
||||
// Build error model
|
||||
double [] errorModel = buildErrorModel(poolPileup.getBases(), referenceSamplePileup.getBases()[0], defaultPrior);
|
||||
|
||||
// Debug error model
|
||||
System.out.println(laneID + ":");
|
||||
for (double v : errorModel)
|
||||
System.out.print(v + ", ");
|
||||
System.out.println();
|
||||
|
||||
// todo: call each pool for this site
|
||||
// todo: merge pools
|
||||
// todo: decide whether or not it's a variant
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Long reduceInit() { return 0l; }
|
||||
/**
|
||||
* GATK Engine creates readgroups of the form XXX.Y.Z
|
||||
* XXX.Y is the unique lane identifier.
|
||||
* Z is the id of the sample to make the read group id unique
|
||||
* This function returns a map that groups together all samples that were run in the same lane:
|
||||
* XXX.Y -> {Z1, Z2, Z3, ... Zn}
|
||||
* @param readGroups A collection of read group strings (obtained from the alignment context pileup)
|
||||
* @return A map of lanes and samples, as described above
|
||||
*/
|
||||
// private HashMap<String, ArrayList<String>> decodeReadIDs(Collection<String> readGroups) {
|
||||
// HashMap<String, ArrayList<String>> result = new HashMap<String, ArrayList<String>>();
|
||||
// for (String rgid : readGroups) {
|
||||
// String [] parsedId = rgid.split("\\.");
|
||||
// String lane = parsedId[0] + "." + parsedId[1];
|
||||
// String id = parsedId[2];
|
||||
// ArrayList<String> idList = (result.get(lane) == null) ? (new ArrayList<String>()) : result.get(lane);
|
||||
// idList.add(id);
|
||||
// result.put(lane, idList);
|
||||
// }
|
||||
// return result;
|
||||
// }
|
||||
|
||||
/**
|
||||
* GATK Engine creates readgroups of the form XXX.Y.Z
|
||||
* XXX.Y is the unique lane identifier.
|
||||
* Z is the id of the sample to make the read group id unique
|
||||
* This function returns a list of unique lane identifiers.
|
||||
* @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup)
|
||||
* @return a collection of lane ids.
|
||||
*/
|
||||
private Set<String> getLaneIDs(Collection<String> readGroups) {
|
||||
HashSet<String> result = new HashSet<String>();
|
||||
for (String rgid : readGroups) {
|
||||
String [] parsedId = rgid.split("\\.");
|
||||
result.add(parsedId[0] + "." + parsedId[1]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public Long reduceInit() {
|
||||
return 0l;
|
||||
}
|
||||
|
||||
public Long reduce(Integer value, Long sum) {
|
||||
return value + sum;
|
||||
|
|
@ -55,3 +251,4 @@ public class ReplicationValidationWalker extends LocusWalker<Integer, Long> impl
|
|||
out.println(c);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -608,6 +608,14 @@ public class MathUtils {
|
|||
return average(vals, vals.size());
|
||||
}
|
||||
|
||||
public static byte average(byte[] vals) {
|
||||
int sum = 0;
|
||||
for (byte v : vals) {
|
||||
sum += v;
|
||||
}
|
||||
return (byte) Math.floor(sum/vals.length);
|
||||
}
|
||||
|
||||
public static double averageDouble(List<Double> vals) {
|
||||
return averageDouble(vals, vals.size());
|
||||
}
|
||||
|
|
@ -1048,4 +1056,19 @@ public class MathUtils {
|
|||
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
|
||||
}
|
||||
|
||||
static public double phredScaleToProbability (byte q) {
|
||||
return Math.pow(10,(-q)/10.0);
|
||||
}
|
||||
|
||||
static public double phredScaleToLog10Probability (byte q) {
|
||||
return ((-q)/10.0);
|
||||
}
|
||||
|
||||
static public byte probabilityToPhredScale (double p) {
|
||||
return (byte) ((-10) * Math.log10(p));
|
||||
}
|
||||
|
||||
static public double log10ProbabilityToPhredScale (double log10p) {
|
||||
return (-10) * log10p;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -520,6 +520,39 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public RBP getPileupForLane(String laneID) {
|
||||
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
|
||||
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>)pileupElementTracker;
|
||||
PerSamplePileupElementTracker<PE> filteredTracker = new PerSamplePileupElementTracker<PE>();
|
||||
|
||||
for(Sample sample: tracker.getSamples()) {
|
||||
PileupElementTracker<PE> perSampleElements = tracker.getElements(sample);
|
||||
AbstractReadBackedPileup<RBP,PE> pileup = createNewPileup(loc,perSampleElements).getPileupForLane(laneID);
|
||||
if(pileup != null)
|
||||
filteredTracker.addElements(sample,pileup.pileupElementTracker);
|
||||
}
|
||||
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
|
||||
}
|
||||
else {
|
||||
UnifiedPileupElementTracker<PE> filteredTracker = new UnifiedPileupElementTracker<PE>();
|
||||
for(PE p: pileupElementTracker) {
|
||||
SAMRecord read = p.getRead();
|
||||
if(laneID != null) {
|
||||
if(read.getReadGroup() != null &&
|
||||
(read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different
|
||||
(read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same
|
||||
filteredTracker.add(p);
|
||||
}
|
||||
else {
|
||||
if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null)
|
||||
filteredTracker.add(p);
|
||||
}
|
||||
}
|
||||
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
|
||||
}
|
||||
}
|
||||
|
||||
public Collection<String> getSampleNames() {
|
||||
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
|
||||
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>)pileupElementTracker;
|
||||
|
|
@ -608,6 +641,32 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public RBP getPileupForSampleNames(Collection<String> sampleNames) {
|
||||
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
|
||||
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>)pileupElementTracker;
|
||||
PileupElementTracker<PE> filteredElements = tracker.getElements(sampleNames);
|
||||
return filteredElements != null ? (RBP)createNewPileup(loc,filteredElements) : null;
|
||||
}
|
||||
else {
|
||||
HashSet<String> hashSampleNames = new HashSet<String>(sampleNames); // to speed up the "contains" access in the for loop
|
||||
UnifiedPileupElementTracker<PE> filteredTracker = new UnifiedPileupElementTracker<PE>();
|
||||
for(PE p: pileupElementTracker) {
|
||||
SAMRecord read = p.getRead();
|
||||
if(sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else.
|
||||
if(read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample()))
|
||||
filteredTracker.add(p);
|
||||
}
|
||||
else {
|
||||
if(read.getReadGroup() == null || read.getReadGroup().getSample() == null)
|
||||
filteredTracker.add(p);
|
||||
}
|
||||
}
|
||||
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public RBP getPileupForSampleName(String sampleName) {
|
||||
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
|
||||
|
|
@ -716,6 +775,15 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
|||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if there are 0 elements in the pileup, false otherwise
|
||||
*/
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
return size==0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return the location of this pileup
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ class MergingPileupElementIterator<PE extends PileupElement> implements Iterator
|
|||
private final PriorityQueue<PeekableIterator<PE>> perSampleIterators;
|
||||
|
||||
public MergingPileupElementIterator(PerSamplePileupElementTracker<PE> tracker) {
|
||||
perSampleIterators = new PriorityQueue<PeekableIterator<PE>>(tracker.getSamples().size(),new PileupElementIteratorComparator());
|
||||
perSampleIterators = new PriorityQueue<PeekableIterator<PE>>(Math.max(1,tracker.getSamples().size()),new PileupElementIteratorComparator());
|
||||
for(Sample sample: tracker.getSamples()) {
|
||||
PileupElementTracker<PE> trackerPerSample = tracker.getElements(sample);
|
||||
if(trackerPerSample.size() != 0)
|
||||
|
|
|
|||
|
|
@ -94,6 +94,15 @@ class PerSamplePileupElementTracker<PE extends PileupElement> extends PileupElem
|
|||
return pileup.get(sampleNames.get(sampleName));
|
||||
}
|
||||
|
||||
public PileupElementTracker<PE> getElements(final Collection<String> selectSampleNames) {
|
||||
PerSamplePileupElementTracker<PE> result = new PerSamplePileupElementTracker<PE>();
|
||||
for (String sample : selectSampleNames) {
|
||||
Sample sampleObject = sampleNames.get(sample);
|
||||
result.addElements(sampleObject, pileup.get(sampleObject));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public void addElements(final Sample sample, PileupElementTracker<PE> elements) {
|
||||
pileup.put(sample,elements);
|
||||
sampleNames.put(sample.getId(), sample);
|
||||
|
|
|
|||
|
|
@ -130,6 +130,15 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
|
|||
*/
|
||||
public ReadBackedPileup getPileupForReadGroup(String readGroupId);
|
||||
|
||||
/**
|
||||
* Gets all reads in a given lane id. (Lane ID is the read group
|
||||
* id stripped of the last .XX sample identifier added by the GATK).
|
||||
* @param laneID The read group ID without the sample identifier added by the GATK.
|
||||
* @return A pileup containing the reads from all samples in the given lane.
|
||||
*/
|
||||
public ReadBackedPileup getPileupForLane(String laneID);
|
||||
|
||||
|
||||
/**
|
||||
* Gets a collection of all the samples stored in this pileup.
|
||||
* @return Collection of samples in this pileup.
|
||||
|
|
@ -142,6 +151,15 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
|
|||
*/
|
||||
public Collection<String> getSampleNames();
|
||||
|
||||
|
||||
/**
|
||||
* Gets the particular subset of this pileup for all the given sample names.
|
||||
* @param sampleNames Name of the sample to use.
|
||||
* @return A subset of this pileup containing only reads with the given sample.
|
||||
*/
|
||||
public ReadBackedPileup getPileupForSampleNames(Collection<String> sampleNames);
|
||||
|
||||
|
||||
/**
|
||||
* Gets the particular subset of this pileup with the given sample name.
|
||||
* @param sampleName Name of the sample to use.
|
||||
|
|
@ -170,6 +188,11 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
|
|||
*/
|
||||
public int size();
|
||||
|
||||
/**
|
||||
* @return true if there are 0 elements in the pileup, false otherwise
|
||||
*/
|
||||
public boolean isEmpty();
|
||||
|
||||
/**
|
||||
* @return the location of this pileup
|
||||
*/
|
||||
|
|
@ -214,4 +237,5 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
|
|||
* @return
|
||||
*/
|
||||
public byte[] getMappingQuals();
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue