From da6f2d3c9de060ddf31719378f718cb80514ea62 Mon Sep 17 00:00:00 2001
From: aaron
Date: Wed, 13 Apr 2011 22:11:08 +0000
Subject: [PATCH] adding the capseg tools to the new walker repo

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5632 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../walkers/capseg/AlleleCountWalker.java     | 135 +++++++++++
 .../gatk/walkers/capseg/BaitDepthWalker.java  | 224 ++++++++++++++++++
 .../walkers/capseg/SampleToLaneWalker.java    |  83 +++++++
 3 files changed, 442 insertions(+)
 create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/capseg/AlleleCountWalker.java
 create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/capseg/BaitDepthWalker.java
 create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/capseg/SampleToLaneWalker.java

diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/capseg/AlleleCountWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/capseg/AlleleCountWalker.java
new file mode 100755
index 000000000..d5618b5a3
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/capseg/AlleleCountWalker.java
@@ -0,0 +1,135 @@
+package org.broadinstitute.sting.gatk.walkers.capseg;
+
+import org.broad.tribble.util.variantcontext.Allele;
+import org.broad.tribble.util.variantcontext.VariantContext;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.By;
+import org.broadinstitute.sting.gatk.walkers.DataSource;
+import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.gatk.walkers.RodWalker;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * get the allele counts at loci that overlap both the bait list and DbSNP
+ */
+//@By(DataSource.REFERENCE_ORDERED_DATA)
+public class AlleleCountWalker extends LocusWalker<Integer, Integer> { // generics restored: map returns Integer, reduce folds Integers
+    @Argument(doc = "our output file name", shortName = "fl")
+    File output;
+
+    String dbTrack = "DbSNP";
+    String callTrack = "calls";
+    Random generator = new Random();
+    private PrintWriter out;
+
+    public void initialize() {
+        try {
+            out = new PrintWriter(new FileWriter(output));
+        } catch (IOException e) {
+            throw new IllegalArgumentException("Unable to open output file " + output, e); // preserve the cause; was a bare "Uncaught exception!" that dropped it
+        }
+        out.println("contig,position,A,B,aBase,bBase,dbSNPOrientation");
+    }
+
+    @Override
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        // check that the tracker and the context are not null, and we have a variant, and some amount of bases
+        if (tracker == null || context == null || tracker.getAllVariantContexts(ref).size() < 1 || context.getBasePileup().size() == 0)
+            return 0;
+
+        // get the het
+        VariantContext het = tracker.getVariantContext(ref, callTrack, context.getLocation());
+        VariantContext dbSNP = tracker.getVariantContext(ref, dbTrack, context.getLocation());
+        if (het == null || het.getHetCount() == 0) return 0;
+
+        // tally pileup bases into A/C/G/T/other buckets (case-insensitive)
+        byte[] bases = context.getBasePileup().getBases();
+        int[] counts = new int[5];
+        for (byte b : bases) {
+            if (b == 'a' || b == 'A') counts[0]++;
+            else if (b == 'c' || b == 'C') counts[1]++;
+            else if (b == 'g' || b == 'G') counts[2]++;
+            else if (b == 't' || b == 'T') counts[3]++;
+            else counts[4]++;
+        }
+        int aIndex = 0;
+        byte a = 'a';
+
+        // find the maximum value
+        for (int index = 0; index < counts.length; index++)
+            if (counts[index] > counts[aIndex]) {
+                aIndex = index;
+                a = (byte) ((index < 1) ? 'a' : (index < 2) ? 'c' : (index < 3) ? 'g' : 't');
+            }
+
+        int bIndex = (aIndex != 0) ? 0 : 1;
+        byte b = (byte) ((bIndex == 0) ? 'a' : 'c'); // was always 'a': reported the wrong B base when bIndex starts at column 1 ('c') and is never beaten
+        for (int index = 0; index < counts.length; index++)
+            if (counts[index] > counts[bIndex] && index != aIndex) {
+                bIndex = index;
+                b = (byte) ((index < 1) ? 'a' : (index < 2) ? 'c' : (index < 3) ? 'g' : 't');
+            }
+
+        boolean usedDbSNP = false;
+        // check if there is an ordering that we'd like to subscribe to
+        if (dbSNP != null) {
+            // get the alt from the DbSNP file
+            Set<Allele> alts = dbSNP.getAlternateAlleles();
+
+            // if true, swap the two
+            if (alts.size() == 1 && alts.iterator().next().basesMatch(String.valueOf((char) a))) {
+                byte temp = a;
+                a = b;
+                b = temp;
+
+                int tmp = aIndex;
+                aIndex = bIndex;
+                bIndex = tmp;
+                usedDbSNP = true;
+            } else if (alts.size() == 1 && alts.iterator().next().basesMatch(String.valueOf((char) b))) {
+                usedDbSNP = true;
+            }
+        }
+
+        // no dbSNP orientation available: randomize A/B to avoid reference bias
+        if (!usedDbSNP && generator.nextDouble() > 0.5) {
+
+            byte temp = a;
+            a = b;
+            b = temp;
+
+            int tmp = aIndex;
+            aIndex = bIndex;
+            bIndex = tmp;
+        }
+        if (counts[aIndex] == 0 && counts[bIndex] == 0)
+            return 0;
+        out.println(context.getLocation().getContig() + "," +
+                context.getLocation().getStart() + "," + counts[aIndex] + "," + counts[bIndex] + "," + (char) a + "," + (char) b + "," + usedDbSNP);
+        return 1;
+    }
+
+    @Override
+    public Integer reduceInit() {
+        return 0;
+    }
+
+    @Override
+    public Integer reduce(Integer value, Integer sum) {
+        return value + sum;
+    }
+
+    public void onTraversalDone(Integer result) {
+        out.close();
+        logger.info("[REDUCE RESULT] Traversal result is: " + result);
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/capseg/BaitDepthWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/capseg/BaitDepthWalker.java
new file mode 100755
index 000000000..4554d6a30
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/capseg/BaitDepthWalker.java
@@ -0,0 +1,224 @@
+package org.broadinstitute.sting.gatk.walkers.capseg;
+
+import net.sf.samtools.SAMRecord;
+import org.broad.tribble.bed.FullBEDFeature;
+import 
org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.io.PrintStream;
+import java.util.*;
+
+/**
+ * This tool captures baits depth (as reads per bait) across all baits and
+ * lanes. The output is one line per bait, with columns for each of the
+ * lanes. Reads which overlap two or more baits are added to all baits that
+ * the read overlaps.
+ *
+ */
+public class BaitDepthWalker extends ReadWalker<ReadStats, ReadStats> { // generics restored: map emits ReadStats, reduce folds ReadStats
+
+    // what sample we should assign to our unknown reads
+    private static final String UNKNOWN_SAMPLE_NAME = "unknown";
+
+    // what separator to use
+    private static final String SEPERATOR = ",";
+
+    // the current output location
+    @Output(doc="File to which baits and coverage information for lanes should be written",required=true)
+    PrintStream out;
+
+    String currentBait = null;
+
+    // our current bait name; NOTE(review): never assigned anywhere — see onTraversalDone
+    String baitName = null;
+
+    // create a coverage mapping by readGroup
+    HashMap<String, Integer> currentCoverage = new HashMap<String, Integer>();
+
+    // set of all of the read groups
+    Set<String> readGroups = new LinkedHashSet<String>();
+
+    // set containing the baits and their positions, used for metrics during and after the run
+    HashMap<String, GenomeLoc> baitMappings = new HashMap<String, GenomeLoc>();
+
+    // a structure to hold reads that align to other baits (other than the one we're currently processing)
+    // baits should be contiguous, so reads should not be left in this queue once we've passed the appropriate bait
+    HashMap<String, ArrayList<String>> recordsForOtherBaits = new HashMap<String, ArrayList<String>>();
+
+    // a mapping of the read group to the bam
+    HashMap<String, String> readGroupToBam = new HashMap<String, String>();
+
+    /**
+     * get all of the read groups from the BAM headers, initializing our maps and arrays with this information
+     */
+    @Override
+    public void initialize() {
+        int bamNumber = 0;
+        for (Set<String> set : getToolkit().getMergedReadGroupsByReaders())
+            for (String smp : set) {
+                readGroups.add(smp);
+                readGroupToBam.put(smp,String.valueOf(bamNumber));
+                bamNumber++; // NOTE(review): increments per read group, so RGs from one BAM get distinct indices — likely meant per reader; verify before relying on readGroupToBam
+            }
+        out.print("#bait,location");
+        for (String st : readGroups) {
+            out.print(SEPERATOR);
+            out.print(st);
+            currentCoverage.put(st,0);
+        }
+        out.println();
+    }
+
+    /**
+     * get the read, determine which baits it falls into, and update our matrix
+     * @param ref the reference bases
+     * @param read the sequencer read
+     * @param metaDataTracker our metadata, used to pull in the BED file
+     * @return a read stat object, which tracks how many reads fell into baits, outside of baits, etc
+     */
+    @Override
+    public ReadStats map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
+        // setup the return value
+        ReadStats stats = new ReadStats();
+
+        // get a list of the features
+        List<GATKFeature> features = metaDataTracker.getAllCoveringRods();
+        if (features.size() == 0) {
+            stats.addRead(0);
+        } else {
+            LinkedHashSet<String> coveringBaits = new LinkedHashSet<String>();
+            for (GATKFeature feature : features) {
+                if (feature.getUnderlyingObject() instanceof FullBEDFeature &&
+                        !((FullBEDFeature)feature.getUnderlyingObject()).getName().contains("ooster")) {
+                    String name = ((FullBEDFeature)feature.getUnderlyingObject()).getName();
+                    coveringBaits.add(name);
+                    if (!baitMappings.containsKey(name)) {
+                        baitMappings.put(name,feature.getLocation());
+                    }
+
+                }
+            }
+            if (coveringBaits.size() == 1)
+                addToBaits(coveringBaits.iterator().next(),read.getReadGroup().getReadGroupId());
+            else {
+                // multi-bait read: credit the current bait now, queue the rest for replay
+                List<String> baits = new ArrayList<String>(coveringBaits);
+                for (String bait : baits)
+                    if (bait.equals(currentBait))
+                        addToBaits(bait,read.getReadGroup().getReadGroupId());
+                    else {
+                        if (!recordsForOtherBaits.containsKey(bait))
+                            recordsForOtherBaits.put(bait,new ArrayList<String>());
+                        recordsForOtherBaits.get(bait).add(read.getReadGroup().getReadGroupId());
+                    }
+
+            }
+            stats.addRead(coveringBaits.size());
+        }
+        return stats;
+    }
+
+    /**
+     * add a read to a particular bait name
+     * @param baitName the bait name
+     * @param readGroup the read group (lane) the read came from
+     */
+    public void addToBaits(String baitName, String readGroup) {
+        // if we haven't seen the bait name yet, create a hashmap for it and dump the currentContents to disk
+        if (currentBait == null || !currentBait.equals(baitName)) {
+            if (currentBait != null) {
+                out.print(currentBait + SEPERATOR);
+                out.print(baitMappings.get(currentBait)); // was get(baitName): printed the NEW bait's location on the finished bait's row
+                currentBait = baitName; // must be set before the replay below so the recursive calls take the increment path
+                for (String rg : readGroups) {
+                    out.print(SEPERATOR + currentCoverage.get(rg));
+                    currentCoverage.put(rg,0);
+                }
+
+                out.println();
+                if (recordsForOtherBaits.containsKey(baitName)) {
+                    ArrayList<String> records = recordsForOtherBaits.get(baitName);
+                    recordsForOtherBaits.remove(baitName);
+                    for (String rec : records)
+                        addToBaits(baitName,rec);
+                }
+
+            }
+            currentBait = baitName;
+
+        }
+        if (!readGroups.contains(readGroup))
+            throw new IllegalArgumentException("Novel (unseen) sample name " + readGroup + " is not in the header of any of the BAM files");
+        else
+            currentCoverage.put(readGroup, currentCoverage.get(readGroup) + 1);
+
+    }
+
+    @Override
+    public ReadStats reduceInit() {
+        return new ReadStats();
+    }
+
+    @Override
+    public ReadStats reduce(ReadStats value, ReadStats sum) {
+        return sum.add(value);
+    }
+
+    /**
+     * emit the last bait, any remaining baits, and the read stats
+     * @param result the reads stats object
+     */
+    public void onTraversalDone(ReadStats result) {
+        out.print(currentBait + SEPERATOR); // was the never-assigned 'baitName' field (always null); flush the last open bait
+        out.print(baitMappings.get(currentBait));
+        for (String rg : this.readGroups)
+            out.print(SEPERATOR + currentCoverage.get(rg));
+        out.println();
+        /*for (Map.Entry<String, ArrayList<String>> entry : recordsForOtherBaits.entrySet()) {
+            out.print(entry.getKey() + SEPERATOR);
+            out.print(baitMappings.get(baitName));
+            for (String rg : this.readGroups) {
+                int index = entry.getValue().indexOf(rg);
+                ///out.print(SEPERATOR + (index < 0 ? 0 : entry.getValue().));
+            }
+            out.println();
+        }*/
+
+        currentBait = baitName; // NOTE(review): 'baitName' is never assigned, so this resets currentBait to null; harmless after close
+        currentCoverage.clear();
+        out.flush();
+        out.close();
+        logger.info("[REDUCE RESULT] Traversal result is: " + result);
+    }
+
+}
+
+// a small helper class for storing read statistics
+class ReadStats {
+    int numberInBait = 0;
+    int numberOutsideOfBait = 0;
+    int baitToReadCount = 0; // the number of baits we've seen, used to determine the bait/read ratio
+    int readCount = 0;
+
+    public void addRead(int baitCount) {
+        baitToReadCount += baitCount;
+        numberInBait += (baitCount > 0) ? 1 : 0;
+        numberOutsideOfBait += (baitCount > 0) ? 0 : 1;
+        readCount++;
+    }
+
+    public String toString() { // fixed: missing " = ", integer division, and divide-by-zero when no reads were seen
+        return "Number in baits = " + numberInBait + ", number outside of baits = " + numberOutsideOfBait + ", baits per read = " + (readCount == 0 ? 0.0 : (double) baitToReadCount / readCount);
+    }
+
+    public ReadStats add(ReadStats stat) {
+        this.numberInBait += stat.numberInBait;
+        this.numberOutsideOfBait += stat.numberOutsideOfBait;
+        this.baitToReadCount += stat.baitToReadCount;
+        this.readCount += stat.readCount;
+        return this;
+    }
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/capseg/SampleToLaneWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/capseg/SampleToLaneWalker.java
new file mode 100755
index 000000000..bd9cbaa27
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/capseg/SampleToLaneWalker.java
@@ -0,0 +1,83 @@
+package org.broadinstitute.sting.gatk.walkers.capseg;
+
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMReadGroupRecord;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
+import 
org.broadinstitute.sting.gatk.datasources.sample.Sample;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * Writes two tab-separated mapping tables when the traversal starts:
+ * a sample-to-read-group table and a BAM-file-to-sample table.
+ *
+ * User: aaron
+ * Date: 3/7/11
+ */
+public class SampleToLaneWalker extends ReadWalker<Integer, Integer> { // generics restored: map/reduce use Integer placeholders
+
+    @Argument(doc="Sample to read group file", shortName="strf", required=true)
+    BufferedOutputStream sampleToReadGroup;
+
+    @Output(doc="BAM file to sample file", shortName="bs")
+    BufferedOutputStream bamToSample;
+
+    public void initialize() {
+        PrintWriter sr_writer = new PrintWriter(sampleToReadGroup);
+        PrintWriter bs_writer = new PrintWriter(bamToSample);
+
+        // write out the sample to read-group file
+        sr_writer.println("Sample\tReadGroup");
+        SAMFileHeader header = getToolkit().getSAMFileHeader();
+        for (SAMReadGroupRecord rec : header.getReadGroups()) {
+            Sample sample = getToolkit().getSampleByReadGroup(rec);
+            sr_writer.println(sample.getId() + "\t" + rec.getReadGroupId());
+        }
+        sr_writer.flush();
+
+        // write out the bam file to the sample information
+        bs_writer.println("BAM\tSample");
+        for (SAMReaderID rec : getToolkit().getReadsDataSource().getReaderIDs()) {
+            File fl = getToolkit().getSourceFileForReaderID(rec);
+            Iterator<SAMReadGroupRecord> iter = getToolkit().getSAMFileHeader(rec).getReadGroups().iterator();
+            Set<String> names = new HashSet<String>(); // distinct sample names for this BAM
+            while (iter.hasNext())
+                names.add(iter.next().getSample());
+            Iterator<String> strs = names.iterator();
+            String rg = "";
+            if (strs.hasNext())
+                rg = strs.next();
+            while (strs.hasNext()) {
+                rg = rg + ";" + strs.next(); // semicolon-join any additional sample names
+            }
+            bs_writer.println(fl.getName() + "\t" + rg);
+        }
+        bs_writer.flush();
+    }
+
+    @Override
+    public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
+        return 0; // no per-read work: all output is produced in initialize()
+    }
+
+    @Override
+    public Integer reduceInit() {
+        return 0;
+    }
+
+    @Override
+    public Integer reduce(Integer value, Integer sum) {
+        return 0; // the accumulator is unused for this walker
+    }
+}