A downsampling validator. Compares the generated pileup passed in from the alignment context to the reads,

passed in as a Tribble SAM text feature.  If the generated pileup contains a valid set of reads according to
the downsampling rules, the test passes.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3421 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-05-23 21:49:54 +00:00
parent e9ee55d7dd
commit a40e64e47b
6 changed files with 426 additions and 19 deletions

View File

@ -4,6 +4,7 @@ import net.sf.picard.filter.FilteringIterator;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
@ -127,7 +128,7 @@ public abstract class LocusView extends LocusIterator implements View {
// Find the next.
seedNextLocus();
if( sourceInfo.getDownsamplingMethod().toCoverage != null )
if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null )
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
// if the current loci isn't null, get the overflow tracker and pass it to the alignment context

View File

@ -44,7 +44,7 @@ import static org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileup
* @version 0.1
*/
public class SAMPileupCodec implements FeatureCodec<SAMPileupFeature> {
// the number of tokens we expect to parse from a dbSNP line
// the number of tokens we expect to parse from a pileup line
private static final int expectedTokenCount = 10;
private static final char fldDelim = '\t';
@ -59,8 +59,7 @@ public class SAMPileupCodec implements FeatureCodec<SAMPileupFeature> {
* Return the # of header lines for this file.
*
* @param reader the line reader
* @return 0 in this case, we assume no header lines. The DbSNP file may have a
* header line beginning with '#', but we can ignore that in the decode function.
* @return 0 in this case, we assume no header lines.
*/
public int readHeader(LineReader reader) {
// we don't require a header line, but it may exist. We'll deal with that above.
@ -82,10 +81,9 @@ public class SAMPileupCodec implements FeatureCodec<SAMPileupFeature> {
"(expected = " + expectedTokenCount + ", saw = " + count + " on " +
"line = " + line + ")");
SAMPileupFeature feature = new SAMPileupFeature(tokens[0],
0,
0);
SAMPileupFeature feature = new SAMPileupFeature();
feature.setChr(tokens[0]);
feature.setStart(Integer.parseInt(tokens[1]));
if(tokens[2].length() != 1)

View File

@ -59,19 +59,9 @@ public class SAMPileupFeature implements Feature {
private double variantScore = 0;
/**
* create the dbSNP feature, given the following information:
*
* @param contig the contig rsID
* @param start the start position, one based
* @param stop the stop position, one based
* create the pileup feature. Default protection so that only other classes in this package can create it.
*/
SAMPileupFeature(String contig,
int start,
int stop) {
this.contig = contig;
this.start = start;
this.stop = stop;
}
SAMPileupFeature() {}
public String getChr() {
return contig;

View File

@ -0,0 +1,113 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.refdata.features.samread;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.exception.CodecLineParsingException;
import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.util.LineReader;
import net.sf.samtools.util.StringUtil;
import net.sf.samtools.TextCigarCodec;
import net.sf.samtools.Cigar;
/**
* Decodes a simple SAM text string.
*
* @author mhanna
* @version 0.1
*/
public class SAMReadCodec implements FeatureCodec<SAMReadFeature> {
/* SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB */
// the number of tokens we expect to parse from a read line
private static final int expectedTokenCount = 11;
/**
* Return the # of header lines for this file.
*
* @param reader the line reader
* @return 0 in this case, we assume no header lines. The reads file may have a
* header line beginning with '@', but we can ignore that in the decode function.
*/
public int readHeader(LineReader reader) {
// we don't require a header line, but it may exist. We'll deal with that above.
return 0;
}
/**
* Decode a single line in a SAM text file.
* @param line line to decode.
* @return A SAMReadFeature modeling that line.
*/
public SAMReadFeature decode(String line) {
// we may be asked to process a header line; ignore it
if (line.startsWith("@")) return null;
String[] tokens = new String[expectedTokenCount];
// split the line
int count = ParsingUtils.splitWhitespace(line,tokens);
// check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
if (count != expectedTokenCount)
throw new CodecLineParsingException("the SAM read line didn't have the expected number of tokens " +
"(expected = " + expectedTokenCount + ", saw = " + count + " on " +
"line = " + line + ")");
final String readName = tokens[0];
final int flags = Integer.parseInt(tokens[1]);
final String contigName = tokens[2];
final int alignmentStart = Integer.parseInt(tokens[3]);
final int mapQ = Integer.parseInt(tokens[4]);
final String cigarString = tokens[5];
final String mateContigName = tokens[6];
final int mateAlignmentStart = Integer.parseInt(tokens[7]);
final int inferredInsertSize = Integer.parseInt(tokens[8]);
final byte[] bases = StringUtil.stringToBytes(tokens[9]);
final byte[] qualities = StringUtil.stringToBytes(tokens[10]);
// Infer the alignment end.
Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString);
int alignmentEnd = alignmentStart + cigar.getReferenceLength() - 1;
// Remove printable character conversion from the qualities.
for(byte quality: qualities) quality -= 33;
return new SAMReadFeature(readName,
flags,
contigName,
alignmentStart,
alignmentEnd,
mapQ,
cigarString,
mateContigName,
mateAlignmentStart,
inferredInsertSize,
bases,
qualities);
}
}

View File

@ -0,0 +1,198 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.refdata.features.samread;
import org.broad.tribble.Feature;
/**
* Represents a SAM record read from a SAM text format file.
*
* @author mhanna
* @version 0.1
*/
public class SAMReadFeature implements Feature {
/**
* Name of this read.
*/
private final String readName;
/**
* Flags associated with this read.
*/
private final int flags;
/**
* Contig to which this read is aligned.
*/
private final String contig;
/**
* Position on contig to which this read is aligned.
*/
private final int alignmentStart;
/**
* Position on contig at which this alignment ends.
*/
private final int alignmentEnd;
/**
* Mapping quality for the read.
*/
private final int mapQ;
/**
* Cigar string matching read to reference.
*/
private final String cigarString;
/**
* Contig to which this read's pair is aligned.
*/
private final String mateContig;
/**
* Position in contig to which this read's pair is aligned.
*/
private final int mateAlignmentStart;
/**
* Size between pairs.
*/
private final int insertSize;
/**
* Bases in this read.
*/
private final byte[] bases;
/**
* Qualities constituting this read.
*/
private final byte[] qualities;
// Tags are not currently supported.
/**
* create the read feature. Default protection so that only other classes in this package can create it.
*/
SAMReadFeature(final String readName,
final int flags,
final String contig,
final int alignmentStart,
final int alignmentEnd,
final int mapQ,
final String cigarString,
final String mateContig,
final int mateAlignmentStart,
final int insertSize,
final byte[] bases,
final byte[] qualities) {
this.readName = readName;
this.flags = flags;
this.contig = contig;
this.alignmentStart = alignmentStart;
this.alignmentEnd = alignmentEnd;
this.mapQ = mapQ;
this.cigarString = cigarString;
this.mateContig = mateContig;
this.mateAlignmentStart = mateAlignmentStart;
this.insertSize = insertSize;
this.bases = bases;
this.qualities = qualities;
}
public String getReadName() {
return readName;
}
public int getFlags() {
return flags;
}
public String getReferenceName() {
return contig;
}
public int getAlignmentStart() {
return alignmentStart;
}
public int getAlignmentEnd() {
return alignmentEnd;
}
/**
* An alias for getReferenceName, required by Feature interface.
* @return Aligned contig name.
*/
public String getChr() {
return getReferenceName();
}
/**
* An alias for getAlignmentEnd(), required by Feature interface.
* @return End of alignment, inclusive.
*/
public int getStart() {
return getAlignmentStart();
}
/**
* An alias for getAlignmentStart(), required by Feature interface.
* @return Aligned position. 1-based.
*/
public int getEnd() {
return getAlignmentEnd();
}
public int getMappingQuality() {
return mapQ;
}
public String getCigarString() {
return cigarString;
}
public String getMateReferenceName() {
return mateContig;
}
public int getMateAlignmentStart() {
return mateAlignmentStart;
}
public int getInferredInsertSize() {
return insertSize;
}
public byte[] getReadBases() {
return bases;
}
public byte[] getReadQualities() {
return qualities;
}
}

View File

@ -0,0 +1,107 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.samread.SAMReadFeature;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.commandline.Argument;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Arrays;
import net.sf.samtools.SAMRecord;
/**
* Checks a given downsampled pileup against the full pileup to ensure that the downsampled pileup could
* possibly be a valid version of the full pileup.
*
* @author mhanna
* @version 0.1
*/
public class DownsamplingValidationWalker extends LocusWalker<Integer,Long> {
@Argument(fullName="max_expected_number_of_reads",shortName="menr",doc="The expected number of reads chosed by the downsampler. Fewer than this number might be added to a given alignment start, but more than this should never be.",required=true)
private int maxExpectedNumberOfReads = 0;
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
ReadBackedPileup pileup = context.getBasePileup();
Collection<Object> allFeatures = tracker.getReferenceMetaData("reads");
Collection<SAMReadFeature> unsampledReadsStartingAtThisLocus = new ArrayList<SAMReadFeature>();
for(Object featureCandidate: allFeatures) {
if(featureCandidate instanceof SAMReadFeature) {
SAMReadFeature feature = (SAMReadFeature)featureCandidate;
if(feature.getReferenceName().equals(ref.getLocus().getContig()) && feature.getAlignmentStart() == ref.getLocus().getStart())
unsampledReadsStartingAtThisLocus.add(feature);
}
}
Collection<SAMRecord> sampledReadsStartingAtThisLocus = new ArrayList<SAMRecord>();
for(SAMRecord read: pileup.getReads()) {
if(read.getReferenceName().equals(ref.getLocus().getContig()) && read.getAlignmentStart() == ref.getLocus().getStart())
sampledReadsStartingAtThisLocus.add(read);
}
int matchingReadsFound = 0;
if(unsampledReadsStartingAtThisLocus.isEmpty()) {
if(!sampledReadsStartingAtThisLocus.isEmpty())
throw new StingException("Downsampler hallucinated a read starting at locus "+ref.getLocus());
}
else {
boolean foundMatch = false;
for(SAMReadFeature unsampledRead: unsampledReadsStartingAtThisLocus) {
for(SAMRecord sampledRead: sampledReadsStartingAtThisLocus) {
if(unsampledRead.getReadName().equals(sampledRead.getReadName()) &&
Arrays.equals(unsampledRead.getReadBases(),sampledRead.getReadBases())) {
foundMatch = true;
matchingReadsFound++;
}
}
}
if(!foundMatch)
throw new StingException("Downsampler failed to include any read starting at locus "+ref.getLocus());
if(matchingReadsFound > maxExpectedNumberOfReads)
throw new StingException("Downsampler found too many reads starting at locus "+ref.getLocus());
}
return matchingReadsFound;
}
// Given result of map function
public Long reduceInit() { return 0L; }
public Long reduce(Integer value, Long sum) {
return value + sum;
}
public Long treeReduce(Long lhs, Long rhs ) {
return lhs+rhs;
}
}