diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index eaf341fc3..f10fd3d02 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -4,6 +4,7 @@ import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; @@ -127,7 +128,7 @@ public abstract class LocusView extends LocusIterator implements View { // Find the next. seedNextLocus(); - if( sourceInfo.getDownsamplingMethod().toCoverage != null ) + if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); // if the current loci isn't null, get the overflow tracker and pass it to the alignment context diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java index 5782a43dd..25f76b05e 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java @@ -44,7 +44,7 @@ import static org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileup * @version 0.1 */ public class SAMPileupCodec implements FeatureCodec { - // the number of tokens we expect to parse from a dbSNP line + // the number of tokens we expect to parse from a pileup line private static final int expectedTokenCount = 10; private static final char fldDelim = '\t'; @@ -59,8 +59,7 @@ public class SAMPileupCodec implements FeatureCodec { * Return the # of header lines for this file. * * @param reader the line reader - * @return 0 in this case, we assume no header lines. The DbSNP file may have a - * header line beginning with '#', but we can ignore that in the decode function. + * @return 0 in this case, we assume no header lines. */ public int readHeader(LineReader reader) { // we don't require a header line, but it may exist. We'll deal with that above. @@ -82,10 +81,9 @@ public class SAMPileupCodec implements FeatureCodec { "(expected = " + expectedTokenCount + ", saw = " + count + " on " + "line = " + line + ")"); - SAMPileupFeature feature = new SAMPileupFeature(tokens[0], - 0, - 0); + SAMPileupFeature feature = new SAMPileupFeature(); + feature.setChr(tokens[0]); feature.setStart(Integer.parseInt(tokens[1])); if(tokens[2].length() != 1) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java index 6b7257913..a794c2704 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java @@ -59,19 +59,9 @@ public class SAMPileupFeature implements Feature { private double variantScore = 0; /** - * create the dbSNP feature, given the following information: - * - * @param contig the contig rsID - * @param start the start position, one based - * @param stop the stop position, one based + * create the pileup feature. Default protection so that only other classes in this package can create it. */ - SAMPileupFeature(String contig, - int start, - int stop) { - this.contig = contig; - this.start = start; - this.stop = stop; - } + SAMPileupFeature() {} public String getChr() { return contig; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java new file mode 100644 index 000000000..e91230351 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.refdata.features.samread; + +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.util.ParsingUtils; +import org.broad.tribble.util.LineReader; +import net.sf.samtools.util.StringUtil; +import net.sf.samtools.TextCigarCodec; +import net.sf.samtools.Cigar; + +/** + * Decodes a simple SAM text string. + * + * @author mhanna + * @version 0.1 + */ +public class SAMReadCodec implements FeatureCodec { + /* SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB */ + + // the number of tokens we expect to parse from a read line + private static final int expectedTokenCount = 11; + + /** + * Return the # of header lines for this file. + * + * @param reader the line reader + * @return 0 in this case, we assume no header lines. The reads file may have a + * header line beginning with '@', but we can ignore that in the decode function. + */ + public int readHeader(LineReader reader) { + // we don't require a header line, but it may exist. We'll deal with that above. + return 0; + } + + /** + * Decode a single line in a SAM text file. + * @param line line to decode. + * @return A SAMReadFeature modeling that line. + */ + public SAMReadFeature decode(String line) { + // we may be asked to process a header line; ignore it + if (line.startsWith("@")) return null; + + String[] tokens = new String[expectedTokenCount]; + + // split the line + int count = ParsingUtils.splitWhitespace(line,tokens); + + // check to see if we've parsed the string into the right number of tokens (expectedTokenCount) + if (count != expectedTokenCount) + throw new CodecLineParsingException("the SAM read line didn't have the expected number of tokens " + + "(expected = " + expectedTokenCount + ", saw = " + count + " on " + + "line = " + line + ")"); + + final String readName = tokens[0]; + final int flags = Integer.parseInt(tokens[1]); + final String contigName = tokens[2]; + final int alignmentStart = Integer.parseInt(tokens[3]); + final int mapQ = Integer.parseInt(tokens[4]); + final String cigarString = tokens[5]; + final String mateContigName = tokens[6]; + final int mateAlignmentStart = Integer.parseInt(tokens[7]); + final int inferredInsertSize = Integer.parseInt(tokens[8]); + final byte[] bases = StringUtil.stringToBytes(tokens[9]); + final byte[] qualities = StringUtil.stringToBytes(tokens[10]); + + // Infer the alignment end. + Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + int alignmentEnd = alignmentStart + cigar.getReferenceLength() - 1; + + // Remove printable character conversion from the qualities. + for(byte quality: qualities) quality -= 33; + + return new SAMReadFeature(readName, + flags, + contigName, + alignmentStart, + alignmentEnd, + mapQ, + cigarString, + mateContigName, + mateAlignmentStart, + inferredInsertSize, + bases, + qualities); + } + + +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java new file mode 100644 index 000000000..7f12b2b2f --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.refdata.features.samread; + +import org.broad.tribble.Feature; + +/** + * Represents a SAM record read from a SAM text format file. + * + * @author mhanna + * @version 0.1 + */ +public class SAMReadFeature implements Feature { + /** + * Name of this read. + */ + private final String readName; + + /** + * Flags associated with this read. + */ + private final int flags; + + /** + * Contig to which this read is aligned. + */ + private final String contig; + + /** + * Position on contig to which this read is aligned. + */ + private final int alignmentStart; + + /** + * Position on contig at which this alignment ends. + */ + private final int alignmentEnd; + + /** + * Mapping quality for the read. + */ + private final int mapQ; + + /** + * Cigar string matching read to reference. + */ + private final String cigarString; + + /** + * Contig to which this read's pair is aligned. + */ + private final String mateContig; + + /** + * Position in contig to which this read's pair is aligned. + */ + private final int mateAlignmentStart; + + /** + * Size between pairs. + */ + private final int insertSize; + + /** + * Bases in this read. + */ + private final byte[] bases; + + /** + * Qualities constituting this read. + */ + private final byte[] qualities; + + // Tags are not currently supported. + + /** + * create the read feature. Default protection so that only other classes in this package can create it. + */ + SAMReadFeature(final String readName, + final int flags, + final String contig, + final int alignmentStart, + final int alignmentEnd, + final int mapQ, + final String cigarString, + final String mateContig, + final int mateAlignmentStart, + final int insertSize, + final byte[] bases, + final byte[] qualities) { + this.readName = readName; + this.flags = flags; + this.contig = contig; + this.alignmentStart = alignmentStart; + this.alignmentEnd = alignmentEnd; + this.mapQ = mapQ; + this.cigarString = cigarString; + this.mateContig = mateContig; + this.mateAlignmentStart = mateAlignmentStart; + this.insertSize = insertSize; + this.bases = bases; + this.qualities = qualities; + } + + public String getReadName() { + return readName; + } + + public int getFlags() { + return flags; + } + + public String getReferenceName() { + return contig; + } + + public int getAlignmentStart() { + return alignmentStart; + } + + public int getAlignmentEnd() { + return alignmentEnd; + } + + /** + * An alias for getReferenceName, required by Feature interface. + * @return Aligned contig name. + */ + public String getChr() { + return getReferenceName(); + } + + /** + * An alias for getAlignmentEnd(), required by Feature interface. + * @return End of alignment, inclusive. + */ + public int getStart() { + return getAlignmentStart(); + } + + /** + * An alias for getAlignmentStart(), required by Feature interface. + * @return Aligned position. 1-based. + */ + public int getEnd() { + return getAlignmentEnd(); + } + + public int getMappingQuality() { + return mapQ; + } + + public String getCigarString() { + return cigarString; + } + + public String getMateReferenceName() { + return mateContig; + } + + public int getMateAlignmentStart() { + return mateAlignmentStart; + } + + public int getInferredInsertSize() { + return insertSize; + } + + public byte[] getReadBases() { + return bases; + } + + public byte[] getReadQualities() { + return qualities; + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DownsamplingValidationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DownsamplingValidationWalker.java new file mode 100644 index 000000000..50c5b2afd --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DownsamplingValidationWalker.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.features.samread.SAMReadFeature; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.commandline.Argument; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Arrays; + +import net.sf.samtools.SAMRecord; + +/** + * Checks a given downsampled pileup against the full pileup to ensure that the downsampled pileup could + * possibly be a valid version of the full pileup. + * + * @author mhanna + * @version 0.1 + */ +public class DownsamplingValidationWalker extends LocusWalker { + @Argument(fullName="max_expected_number_of_reads",shortName="menr",doc="The expected number of reads chosed by the downsampler. Fewer than this number might be added to a given alignment start, but more than this should never be.",required=true) + private int maxExpectedNumberOfReads = 0; + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + ReadBackedPileup pileup = context.getBasePileup(); + Collection allFeatures = tracker.getReferenceMetaData("reads"); + + Collection unsampledReadsStartingAtThisLocus = new ArrayList(); + for(Object featureCandidate: allFeatures) { + if(featureCandidate instanceof SAMReadFeature) { + SAMReadFeature feature = (SAMReadFeature)featureCandidate; + if(feature.getReferenceName().equals(ref.getLocus().getContig()) && feature.getAlignmentStart() == ref.getLocus().getStart()) + unsampledReadsStartingAtThisLocus.add(feature); + } + } + Collection sampledReadsStartingAtThisLocus = new ArrayList(); + for(SAMRecord read: pileup.getReads()) { + if(read.getReferenceName().equals(ref.getLocus().getContig()) && read.getAlignmentStart() == ref.getLocus().getStart()) + sampledReadsStartingAtThisLocus.add(read); + } + + int matchingReadsFound = 0; + if(unsampledReadsStartingAtThisLocus.isEmpty()) { + if(!sampledReadsStartingAtThisLocus.isEmpty()) + throw new StingException("Downsampler hallucinated a read starting at locus "+ref.getLocus()); + } + else { + boolean foundMatch = false; + for(SAMReadFeature unsampledRead: unsampledReadsStartingAtThisLocus) { + for(SAMRecord sampledRead: sampledReadsStartingAtThisLocus) { + if(unsampledRead.getReadName().equals(sampledRead.getReadName()) && + Arrays.equals(unsampledRead.getReadBases(),sampledRead.getReadBases())) { + foundMatch = true; + matchingReadsFound++; + } + } + } + + if(!foundMatch) + throw new StingException("Downsampler failed to include any read starting at locus "+ref.getLocus()); + + if(matchingReadsFound > maxExpectedNumberOfReads) + throw new StingException("Downsampler found too many reads starting at locus "+ref.getLocus()); + } + + return matchingReadsFound; + } + + // Given result of map function + public Long reduceInit() { return 0L; } + public Long reduce(Integer value, Long sum) { + return value + sum; + } + + public Long treeReduce(Long lhs, Long rhs ) { + return lhs+rhs; + } +}