Added preliminary framework for performing short-range phasing (ReadBackedPhasingWalker.java)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3953 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a8d37da10b
commit
b21f90aee0
|
|
@ -0,0 +1,328 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broad.tribble.vcf.*;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Walks along all loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using downstream reads).
|
||||||
|
* Use '-BTI variant' to only stop at positions in the VCF file bound to 'variant'.
|
||||||
|
*/
|
||||||
|
@Requires(value={},referenceMetaData=@RMD(name="variant",type= ReferenceOrderedDatum.class))
|
||||||
|
public class ReadBackedPhasingWalker extends LocusWalker<Pair<VariantContextStats, List<VariantContext>>, VariantContextStats> {
|
||||||
|
|
||||||
|
@Argument(fullName="cacheWindowSize", shortName="cacheWindow", doc="The window size (in bases) to cache variant sites and their reads; [default:300]", required=false)
|
||||||
|
protected Integer cacheWindow = 300;
|
||||||
|
|
||||||
|
@Argument(fullName="phasedVCFFile", shortName="phasedVCF", doc="The name of the phased VCF file output", required=true)
|
||||||
|
protected String phasedVCFFile = null;
|
||||||
|
|
||||||
|
private VCFWriter writer = null;
|
||||||
|
|
||||||
|
private LinkedList<VariantAndAlignment> siteQueue = null;
|
||||||
|
|
||||||
|
private void initializeVcfWriter(VariantContext vc) {
|
||||||
|
// setup the header fields
|
||||||
|
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||||
|
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||||
|
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||||
|
|
||||||
|
writer = new VCFWriter(new File(phasedVCFFile));
|
||||||
|
writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(vc.getSampleNames())));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
siteQueue = new LinkedList<VariantAndAlignment>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean generateExtendedEvents() { // want to see indels
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VariantContextStats reduceInit() { return new VariantContextStats(); }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For each site of interest, cache the current site and then use the cache to phase all upstream sites
|
||||||
|
* for which "sufficient" information has already been observed.
|
||||||
|
*
|
||||||
|
* @param tracker the meta-data tracker
|
||||||
|
* @param ref the reference base
|
||||||
|
* @param context the context for the given locus
|
||||||
|
* @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range.
|
||||||
|
*/
|
||||||
|
public Pair<VariantContextStats, List<VariantContext>> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
VariantContextStats vcStats = new VariantContextStats();
|
||||||
|
List<VariantContext> phasedList = new LinkedList<VariantContext>();
|
||||||
|
if ( tracker == null )
|
||||||
|
return new Pair<VariantContextStats, List<VariantContext>>(vcStats, phasedList);
|
||||||
|
|
||||||
|
List<Object> rods = tracker.getReferenceMetaData("variant");
|
||||||
|
ListIterator<Object> rodIt = rods.listIterator();
|
||||||
|
while (rodIt.hasNext()) {
|
||||||
|
VariantContext vc = VariantContextAdaptors.toVariantContext("variant", rodIt.next(), ref);
|
||||||
|
if (vc.getType() == VariantContext.Type.MNP) {
|
||||||
|
throw new StingException("Doesn't support phasing for multiple-nucleotide polymorphism!");
|
||||||
|
}
|
||||||
|
VariantAndAlignment va = new VariantAndAlignment(vc, context);
|
||||||
|
siteQueue.add(va);
|
||||||
|
|
||||||
|
int numReads = 0;
|
||||||
|
if (context.hasBasePileup()) {
|
||||||
|
numReads = context.getBasePileup().size();
|
||||||
|
}
|
||||||
|
else if (context.hasExtendedEventPileup()) {
|
||||||
|
numReads = context.getExtendedEventPileup().size();
|
||||||
|
}
|
||||||
|
VariantContextStats addInVcStats = new VariantContextStats(numReads, 1);
|
||||||
|
vcStats.addTo(addInVcStats);
|
||||||
|
}
|
||||||
|
|
||||||
|
GenomeLoc refLoc = ref.getLocus();
|
||||||
|
while (!siteQueue.isEmpty()) {
|
||||||
|
VariantContext vc = siteQueue.peek().variant;
|
||||||
|
if (!isInWindowRange(refLoc, vc.getLocation())) { // Already saw all variant positions within cacheWindow distance ahead of vc (on its contig)
|
||||||
|
VariantContext phasedVc = this.phaseVariantAndRemove();
|
||||||
|
phasedList.add(phasedVc);
|
||||||
|
}
|
||||||
|
else { // refLoc is still not far enough ahead of vc
|
||||||
|
break; // since we ASSUME that the VCF is ordered by <contig,locus>
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Pair<VariantContextStats, List<VariantContext>>(vcStats, phasedList);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Phase vc (head of siteQueue) using all VariantContext objects in the siteQueue that are
|
||||||
|
within cacheWindow distance ahead of vc (on its contig).
|
||||||
|
ASSUMES:
|
||||||
|
1. siteQueue is NOT empty.
|
||||||
|
2. All VariantContexts in siteQueue are in positions downstream of vc (head of queue).
|
||||||
|
*/
|
||||||
|
private VariantContext phaseVariantAndRemove() {
|
||||||
|
VariantContext vc = siteQueue.peek().variant;
|
||||||
|
|
||||||
|
ListIterator<VariantAndAlignment> windowIt = siteQueue.listIterator();
|
||||||
|
int toIndex = 0;
|
||||||
|
while (windowIt.hasNext()) {
|
||||||
|
if (isInWindowRange(vc, windowIt.next().variant)) {
|
||||||
|
toIndex++;
|
||||||
|
}
|
||||||
|
else { //moved past the relevant range used for phasing
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
List<VariantAndAlignment> windowVcList = siteQueue.subList(0,toIndex);
|
||||||
|
|
||||||
|
//
|
||||||
|
if (true) {
|
||||||
|
out.println("Will phase vc = " + vc.getLocation());
|
||||||
|
ListIterator<VariantAndAlignment> windowVcIt = windowVcList.listIterator();
|
||||||
|
while (windowVcIt.hasNext()) {
|
||||||
|
VariantContext phaseInfoVc = windowVcIt.next().variant;
|
||||||
|
out.println("Using phaseInfoVc = " + phaseInfoVc.getLocation());
|
||||||
|
}
|
||||||
|
out.println("");
|
||||||
|
}
|
||||||
|
//
|
||||||
|
|
||||||
|
Map<String, Genotype> sampGenotypes = vc.getGenotypes();
|
||||||
|
Map<String, Genotype> phasedGtMap = new TreeMap<String, Genotype>();
|
||||||
|
|
||||||
|
for (Map.Entry<String, Genotype> entry : sampGenotypes.entrySet()) {
|
||||||
|
String samp = entry.getKey();
|
||||||
|
Genotype gt = entry.getValue();
|
||||||
|
|
||||||
|
if (gt.getPloidy() != 2) {
|
||||||
|
throw new StingException("Doesn't support phasing for ploidy that is not 2!");
|
||||||
|
}
|
||||||
|
Allele topAll = gt.getAllele(0);
|
||||||
|
Allele botAll = gt.getAllele(1);
|
||||||
|
|
||||||
|
ListIterator<VariantAndAlignment> windowVcIt = windowVcList.listIterator();
|
||||||
|
while (windowVcIt.hasNext()) {
|
||||||
|
VariantAndAlignment va = windowVcIt.next();
|
||||||
|
VariantContext phaseInfoVc = va.variant;
|
||||||
|
AlignmentContext phaseInfoContext = va.alignment;
|
||||||
|
|
||||||
|
ReadBackedPileup reads = null;
|
||||||
|
if (phaseInfoContext.hasBasePileup()) {
|
||||||
|
reads = phaseInfoContext.getBasePileup();
|
||||||
|
}
|
||||||
|
else if (phaseInfoContext.hasExtendedEventPileup()) {
|
||||||
|
reads = phaseInfoContext.getExtendedEventPileup();
|
||||||
|
}
|
||||||
|
if (reads != null) {
|
||||||
|
ReadBackedPileup sampleReads = null;
|
||||||
|
if (reads.getSamples().contains(samp)) {
|
||||||
|
// Update the phasing table based on the reads for this sample:
|
||||||
|
sampleReads = reads.getPileupForSample(samp);
|
||||||
|
for (PileupElement p : sampleReads) {
|
||||||
|
SAMRecord rd = p.getRead();
|
||||||
|
out.println("read = " + rd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Random rn = new Random();
|
||||||
|
boolean genotypesArePhased = (rn.nextDouble() > 0.5);
|
||||||
|
|
||||||
|
boolean swapChromosomes = (rn.nextDouble() > 0.5);
|
||||||
|
if (swapChromosomes) {
|
||||||
|
Allele tmp = topAll;
|
||||||
|
topAll = botAll;
|
||||||
|
botAll = tmp;
|
||||||
|
}
|
||||||
|
List<Allele> phasedAll = new ArrayList<Allele>();
|
||||||
|
phasedAll.add(0, topAll);
|
||||||
|
phasedAll.add(1, botAll);
|
||||||
|
|
||||||
|
Genotype phasedGt = new Genotype(gt.getSampleName(), phasedAll, gt.getNegLog10PError(), gt.getFilters(), gt.getAttributes(), genotypesArePhased);
|
||||||
|
phasedGtMap.put(samp, phasedGt);
|
||||||
|
}
|
||||||
|
siteQueue.remove(); // remove vc from head of queue
|
||||||
|
|
||||||
|
return new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), phasedGtMap, vc.getNegLog10PError(), vc.getFilters(), vc.getAttributes());
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isInWindowRange(VariantContext vc1, VariantContext vc2) {
|
||||||
|
GenomeLoc loc1 = vc1.getLocation();
|
||||||
|
GenomeLoc loc2 = vc2.getLocation();
|
||||||
|
|
||||||
|
return isInWindowRange(loc1, loc2);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isInWindowRange(GenomeLoc loc1, GenomeLoc loc2) {
|
||||||
|
return (loc1.onSameContig(loc2) && loc1.distance(loc2) <= cacheWindow);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeVCF(VariantContext vc) {
|
||||||
|
if ( writer == null )
|
||||||
|
initializeVcfWriter(vc);
|
||||||
|
|
||||||
|
byte refBase;
|
||||||
|
if (!vc.isIndel()) {
|
||||||
|
Allele varAllele = vc.getReference();
|
||||||
|
refBase = varAllele.getBases()[0];
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
refBase = vc.getReferenceBaseForIndel();
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.add(vc, refBase);
|
||||||
|
}
|
||||||
|
|
||||||
|
public VariantContextStats reduce(Pair<VariantContextStats, List<VariantContext>> statsAndList, VariantContextStats stats) {
|
||||||
|
Iterator<VariantContext> varContIter = statsAndList.second.iterator();
|
||||||
|
writeVarContIter(varContIter);
|
||||||
|
|
||||||
|
stats.addTo(statsAndList.first);
|
||||||
|
return stats;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Phase anything left in the cached siteQueue, and report the number of reads and VariantContexts processed.
|
||||||
|
*
|
||||||
|
* @param result the number of reads and VariantContexts seen.
|
||||||
|
*/
|
||||||
|
public void onTraversalDone(VariantContextStats result) {
|
||||||
|
List<VariantContext> finalList = new LinkedList<VariantContext>();
|
||||||
|
while (!siteQueue.isEmpty()) {
|
||||||
|
VariantContext phasedVc = this.phaseVariantAndRemove();
|
||||||
|
finalList.add(phasedVc);
|
||||||
|
}
|
||||||
|
writeVarContIter(finalList.iterator());
|
||||||
|
|
||||||
|
if ( writer != null )
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
out.println("Number of reads observed: " + result.getNumReads());
|
||||||
|
out.println("Number of variant sites observed: " + result.getNumVarSites());
|
||||||
|
out.println("Average coverage: " + ((double) result.getNumReads() / result.getNumVarSites()));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void writeVarContIter(Iterator<VariantContext> varContIter) {
|
||||||
|
while (varContIter.hasNext()) {
|
||||||
|
VariantContext vc = varContIter.next();
|
||||||
|
writeVCF(vc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class VariantAndAlignment {
|
||||||
|
public VariantContext variant;
|
||||||
|
public AlignmentContext alignment;
|
||||||
|
|
||||||
|
public VariantAndAlignment(VariantContext variant, AlignmentContext alignment) {
|
||||||
|
this.variant = variant;
|
||||||
|
this.alignment = alignment;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Running totals of the number of reads and the number of variant sites observed.
 *
 * Totals are kept as {@code int}; accumulation saturates at the {@code int} bounds
 * instead of silently wrapping around.
 */
class VariantContextStats {
    private int numReads;
    private int numVarSites;

    /** Creates an accumulator with both totals at zero. */
    public VariantContextStats() {
        this(0, 0);
    }

    /**
     * @param numReads     initial number of reads
     * @param numVarSites  initial number of variant sites
     */
    public VariantContextStats(int numReads, int numVarSites) {
        this.numReads = numReads;
        this.numVarSites = numVarSites;
    }

    /**
     * Adds the totals of {@code other} into this object; {@code other} is left unchanged.
     *
     * @param other the statistics to fold into this accumulator
     */
    public void addTo(VariantContextStats other) {
        this.numReads = saturatingAdd(this.numReads, other.numReads);
        this.numVarSites = saturatingAdd(this.numVarSites, other.numVarSites);
    }

    /** Adds two ints, clamping the result to [Integer.MIN_VALUE, Integer.MAX_VALUE] rather than overflowing. */
    private static int saturatingAdd(int a, int b) {
        long sum = (long) a + (long) b;
        if (sum > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        if (sum < Integer.MIN_VALUE) {
            return Integer.MIN_VALUE;
        }
        return (int) sum;
    }

    /** @return the total number of reads accumulated so far */
    public int getNumReads() {
        return numReads;
    }

    /** @return the total number of variant sites accumulated so far */
    public int getNumVarSites() {
        return numVarSites;
    }
}
|
||||||
Loading…
Reference in New Issue