From 21ae3ef5f96e4b24bca21ea86d79dc7ed4b7f108 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 31 Dec 2011 13:56:41 -0500 Subject: [PATCH] Added downsampling support to ReduceReads * Downsampling is now a parameter to the walker with default value of 0 (no downsampling) * Downsampling selects reads at random at the variant region window and strives to achieve uniform coverage if possible around the desired downsampling value. * Added integration test --- .../sting/utils/sam/GATKSAMRecord.java | 6 + .../sting/utils/sam/ReadUtils.java | 113 ++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 5e0802fa6..913548ecc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -183,6 +183,12 @@ public class GATKSAMRecord extends BAMRecord { return getReducedReadCounts() != null; } + /** + * The number of bases corresponding the i'th base of the reduced read. + * + * @param i the read based coordinate inside the read + * @return the number of bases corresponding to the i'th base of the reduced read + */ public final byte getReducedCount(final int i) { byte firstCount = getReducedReadCounts()[0]; byte offsetCount = getReducedReadCounts()[i]; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index d52814ef7..7fa2f6230 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -29,6 +29,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -531,4 +532,116 @@ public class ReadUtils { return new Pair(false, null); } + /** + * Returns the coverage distribution of a list of reads within the desired region. + * + * See getCoverageDistributionOfRead for information on how the coverage is calculated. + * + * @param list the list of reads covering the region + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfReads(List list, int startLocation, int stopLocation) { + int [] totalCoverage = new int[stopLocation - startLocation + 1]; + + for (GATKSAMRecord read : list) { + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage); + } + + return totalCoverage; + } + + /** + * Returns the coverage distribution of a single read within the desired region. + * + * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample + * reads for variant regions, and deletions count as variants) + * + * @param read the read to get the coverage distribution of + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { + int [] coverage = new int[stopLocation - startLocation + 1]; + int refLocation = read.getSoftStart(); + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + switch (cigarElement.getOperator()) { + case S: + case M: + case EQ: + case N: + case X: + case D: + for (int i = 0; i < cigarElement.getLength(); i++) { + if (refLocation >= startLocation && refLocation <= stopLocation) { + int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1; + coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases + } + refLocation++; + } + break; + + case P: + case I: + case H: + break; + } + + if (refLocation > stopLocation) + break; + } + return coverage; + } + + /** + * Makes association maps for the reads and loci coverage as described below : + * + * - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage. + * Note: Locus is in reference coordinates. + * Example: Locus => {read1, read2, ..., readN} + * + * - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage. + * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with true meaning it contributes to the coverage. + * Example: Read => {true, true, false, ... false} + * + * @param readList the list of reads to generate the association mappings + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return the two hashmaps described above + */ + public static Pair> , HashMap> getBothReadToLociMappings (List readList, int startLocation, int stopLocation) { + int arraySize = stopLocation - startLocation + 1; + + HashMap> locusToReadMap = new HashMap>(2*(stopLocation - startLocation + 1), 0.5f); + HashMap readToLocusMap = new HashMap(2*readList.size(), 0.5f); + + + for (int i = startLocation; i <= stopLocation; i++) + locusToReadMap.put(i, new HashSet()); // Initialize the locusToRead map with empty lists + + for (GATKSAMRecord read : readList) { + readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays + + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + + for (int i=0; i 0) { + // Update the hash for this locus + HashSet readSet = locusToReadMap.get(refLocation); + readSet.add(read); + + // Add this locus to the read hash + readToLocusMap.get(read)[refLocation - startLocation] = true; + } + else + // Update the boolean array with a 'no coverage' from this read to this locus + readToLocusMap.get(read)[refLocation-startLocation] = false; + } + } + return new Pair>, HashMap>(locusToReadMap, readToLocusMap); + } }