A downsampling validator. Compares the generated pileup passed in from the alignment context to the reads,
passed in as a Tribble SAM text feature. If the generated pileup contains a valid set of reads according to the downsampling rules, the test passes. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3421 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e9ee55d7dd
commit
a40e64e47b
|
|
@ -4,6 +4,7 @@ import net.sf.picard.filter.FilteringIterator;
|
|||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
|
||||
|
|
@ -127,7 +128,7 @@ public abstract class LocusView extends LocusIterator implements View {
|
|||
|
||||
// Find the next.
|
||||
seedNextLocus();
|
||||
if( sourceInfo.getDownsamplingMethod().toCoverage != null )
|
||||
if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null )
|
||||
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
|
||||
|
||||
// if the current loci isn't null, get the overflow tracker and pass it to the alignment context
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ import static org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileup
|
|||
* @version 0.1
|
||||
*/
|
||||
public class SAMPileupCodec implements FeatureCodec<SAMPileupFeature> {
|
||||
// the number of tokens we expect to parse from a dbSNP line
|
||||
// the number of tokens we expect to parse from a pileup line
|
||||
private static final int expectedTokenCount = 10;
|
||||
private static final char fldDelim = '\t';
|
||||
|
||||
|
|
@ -59,8 +59,7 @@ public class SAMPileupCodec implements FeatureCodec<SAMPileupFeature> {
|
|||
* Return the # of header lines for this file.
|
||||
*
|
||||
* @param reader the line reader
|
||||
* @return 0 in this case, we assume no header lines. The DbSNP file may have a
|
||||
* header line beginning with '#', but we can ignore that in the decode function.
|
||||
* @return 0 in this case, we assume no header lines.
|
||||
*/
|
||||
public int readHeader(LineReader reader) {
|
||||
// we don't require a header line, but it may exist. We'll deal with that above.
|
||||
|
|
@ -82,10 +81,9 @@ public class SAMPileupCodec implements FeatureCodec<SAMPileupFeature> {
|
|||
"(expected = " + expectedTokenCount + ", saw = " + count + " on " +
|
||||
"line = " + line + ")");
|
||||
|
||||
SAMPileupFeature feature = new SAMPileupFeature(tokens[0],
|
||||
0,
|
||||
0);
|
||||
SAMPileupFeature feature = new SAMPileupFeature();
|
||||
|
||||
feature.setChr(tokens[0]);
|
||||
feature.setStart(Integer.parseInt(tokens[1]));
|
||||
|
||||
if(tokens[2].length() != 1)
|
||||
|
|
|
|||
|
|
@ -59,19 +59,9 @@ public class SAMPileupFeature implements Feature {
|
|||
private double variantScore = 0;
|
||||
|
||||
/**
|
||||
* create the dbSNP feature, given the following information:
|
||||
*
|
||||
* @param contig the contig rsID
|
||||
* @param start the start position, one based
|
||||
* @param stop the stop position, one based
|
||||
* create the pileup feature. Default protection so that only other classes in this package can create it.
|
||||
*/
|
||||
SAMPileupFeature(String contig,
|
||||
int start,
|
||||
int stop) {
|
||||
this.contig = contig;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
}
|
||||
SAMPileupFeature() {}
|
||||
|
||||
public String getChr() {
|
||||
return contig;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,113 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.features.samread;
|
||||
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.exception.CodecLineParsingException;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broad.tribble.util.LineReader;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import net.sf.samtools.TextCigarCodec;
|
||||
import net.sf.samtools.Cigar;
|
||||
|
||||
/**
|
||||
* Decodes a simple SAM text string.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SAMReadCodec implements FeatureCodec<SAMReadFeature> {
|
||||
/* SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB */
|
||||
|
||||
// the number of tokens we expect to parse from a read line
|
||||
private static final int expectedTokenCount = 11;
|
||||
|
||||
/**
|
||||
* Return the # of header lines for this file.
|
||||
*
|
||||
* @param reader the line reader
|
||||
* @return 0 in this case, we assume no header lines. The reads file may have a
|
||||
* header line beginning with '@', but we can ignore that in the decode function.
|
||||
*/
|
||||
public int readHeader(LineReader reader) {
|
||||
// we don't require a header line, but it may exist. We'll deal with that above.
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode a single line in a SAM text file.
|
||||
* @param line line to decode.
|
||||
* @return A SAMReadFeature modeling that line.
|
||||
*/
|
||||
public SAMReadFeature decode(String line) {
|
||||
// we may be asked to process a header line; ignore it
|
||||
if (line.startsWith("@")) return null;
|
||||
|
||||
String[] tokens = new String[expectedTokenCount];
|
||||
|
||||
// split the line
|
||||
int count = ParsingUtils.splitWhitespace(line,tokens);
|
||||
|
||||
// check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
|
||||
if (count != expectedTokenCount)
|
||||
throw new CodecLineParsingException("the SAM read line didn't have the expected number of tokens " +
|
||||
"(expected = " + expectedTokenCount + ", saw = " + count + " on " +
|
||||
"line = " + line + ")");
|
||||
|
||||
final String readName = tokens[0];
|
||||
final int flags = Integer.parseInt(tokens[1]);
|
||||
final String contigName = tokens[2];
|
||||
final int alignmentStart = Integer.parseInt(tokens[3]);
|
||||
final int mapQ = Integer.parseInt(tokens[4]);
|
||||
final String cigarString = tokens[5];
|
||||
final String mateContigName = tokens[6];
|
||||
final int mateAlignmentStart = Integer.parseInt(tokens[7]);
|
||||
final int inferredInsertSize = Integer.parseInt(tokens[8]);
|
||||
final byte[] bases = StringUtil.stringToBytes(tokens[9]);
|
||||
final byte[] qualities = StringUtil.stringToBytes(tokens[10]);
|
||||
|
||||
// Infer the alignment end.
|
||||
Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString);
|
||||
int alignmentEnd = alignmentStart + cigar.getReferenceLength() - 1;
|
||||
|
||||
// Remove printable character conversion from the qualities.
|
||||
for(byte quality: qualities) quality -= 33;
|
||||
|
||||
return new SAMReadFeature(readName,
|
||||
flags,
|
||||
contigName,
|
||||
alignmentStart,
|
||||
alignmentEnd,
|
||||
mapQ,
|
||||
cigarString,
|
||||
mateContigName,
|
||||
mateAlignmentStart,
|
||||
inferredInsertSize,
|
||||
bases,
|
||||
qualities);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,198 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.features.samread;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
|
||||
/**
|
||||
* Represents a SAM record read from a SAM text format file.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SAMReadFeature implements Feature {
|
||||
/**
|
||||
* Name of this read.
|
||||
*/
|
||||
private final String readName;
|
||||
|
||||
/**
|
||||
* Flags associated with this read.
|
||||
*/
|
||||
private final int flags;
|
||||
|
||||
/**
|
||||
* Contig to which this read is aligned.
|
||||
*/
|
||||
private final String contig;
|
||||
|
||||
/**
|
||||
* Position on contig to which this read is aligned.
|
||||
*/
|
||||
private final int alignmentStart;
|
||||
|
||||
/**
|
||||
* Position on contig at which this alignment ends.
|
||||
*/
|
||||
private final int alignmentEnd;
|
||||
|
||||
/**
|
||||
* Mapping quality for the read.
|
||||
*/
|
||||
private final int mapQ;
|
||||
|
||||
/**
|
||||
* Cigar string matching read to reference.
|
||||
*/
|
||||
private final String cigarString;
|
||||
|
||||
/**
|
||||
* Contig to which this read's pair is aligned.
|
||||
*/
|
||||
private final String mateContig;
|
||||
|
||||
/**
|
||||
* Position in contig to which this read's pair is aligned.
|
||||
*/
|
||||
private final int mateAlignmentStart;
|
||||
|
||||
/**
|
||||
* Size between pairs.
|
||||
*/
|
||||
private final int insertSize;
|
||||
|
||||
/**
|
||||
* Bases in this read.
|
||||
*/
|
||||
private final byte[] bases;
|
||||
|
||||
/**
|
||||
* Qualities constituting this read.
|
||||
*/
|
||||
private final byte[] qualities;
|
||||
|
||||
// Tags are not currently supported.
|
||||
|
||||
/**
|
||||
* create the read feature. Default protection so that only other classes in this package can create it.
|
||||
*/
|
||||
SAMReadFeature(final String readName,
|
||||
final int flags,
|
||||
final String contig,
|
||||
final int alignmentStart,
|
||||
final int alignmentEnd,
|
||||
final int mapQ,
|
||||
final String cigarString,
|
||||
final String mateContig,
|
||||
final int mateAlignmentStart,
|
||||
final int insertSize,
|
||||
final byte[] bases,
|
||||
final byte[] qualities) {
|
||||
this.readName = readName;
|
||||
this.flags = flags;
|
||||
this.contig = contig;
|
||||
this.alignmentStart = alignmentStart;
|
||||
this.alignmentEnd = alignmentEnd;
|
||||
this.mapQ = mapQ;
|
||||
this.cigarString = cigarString;
|
||||
this.mateContig = mateContig;
|
||||
this.mateAlignmentStart = mateAlignmentStart;
|
||||
this.insertSize = insertSize;
|
||||
this.bases = bases;
|
||||
this.qualities = qualities;
|
||||
}
|
||||
|
||||
public String getReadName() {
|
||||
return readName;
|
||||
}
|
||||
|
||||
public int getFlags() {
|
||||
return flags;
|
||||
}
|
||||
|
||||
public String getReferenceName() {
|
||||
return contig;
|
||||
}
|
||||
|
||||
public int getAlignmentStart() {
|
||||
return alignmentStart;
|
||||
}
|
||||
|
||||
public int getAlignmentEnd() {
|
||||
return alignmentEnd;
|
||||
}
|
||||
|
||||
/**
|
||||
* An alias for getReferenceName, required by Feature interface.
|
||||
* @return Aligned contig name.
|
||||
*/
|
||||
public String getChr() {
|
||||
return getReferenceName();
|
||||
}
|
||||
|
||||
/**
|
||||
* An alias for getAlignmentEnd(), required by Feature interface.
|
||||
* @return End of alignment, inclusive.
|
||||
*/
|
||||
public int getStart() {
|
||||
return getAlignmentStart();
|
||||
}
|
||||
|
||||
/**
|
||||
* An alias for getAlignmentStart(), required by Feature interface.
|
||||
* @return Aligned position. 1-based.
|
||||
*/
|
||||
public int getEnd() {
|
||||
return getAlignmentEnd();
|
||||
}
|
||||
|
||||
public int getMappingQuality() {
|
||||
return mapQ;
|
||||
}
|
||||
|
||||
public String getCigarString() {
|
||||
return cigarString;
|
||||
}
|
||||
|
||||
public String getMateReferenceName() {
|
||||
return mateContig;
|
||||
}
|
||||
|
||||
public int getMateAlignmentStart() {
|
||||
return mateAlignmentStart;
|
||||
}
|
||||
|
||||
public int getInferredInsertSize() {
|
||||
return insertSize;
|
||||
}
|
||||
|
||||
public byte[] getReadBases() {
|
||||
return bases;
|
||||
}
|
||||
|
||||
public byte[] getReadQualities() {
|
||||
return qualities;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.samread.SAMReadFeature;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Arrays;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Checks a given downsampled pileup against the full pileup to ensure that the downsampled pileup could
|
||||
* possibly be a valid version of the full pileup.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class DownsamplingValidationWalker extends LocusWalker<Integer,Long> {
|
||||
@Argument(fullName="max_expected_number_of_reads",shortName="menr",doc="The expected number of reads chosed by the downsampler. Fewer than this number might be added to a given alignment start, but more than this should never be.",required=true)
|
||||
private int maxExpectedNumberOfReads = 0;
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
ReadBackedPileup pileup = context.getBasePileup();
|
||||
Collection<Object> allFeatures = tracker.getReferenceMetaData("reads");
|
||||
|
||||
Collection<SAMReadFeature> unsampledReadsStartingAtThisLocus = new ArrayList<SAMReadFeature>();
|
||||
for(Object featureCandidate: allFeatures) {
|
||||
if(featureCandidate instanceof SAMReadFeature) {
|
||||
SAMReadFeature feature = (SAMReadFeature)featureCandidate;
|
||||
if(feature.getReferenceName().equals(ref.getLocus().getContig()) && feature.getAlignmentStart() == ref.getLocus().getStart())
|
||||
unsampledReadsStartingAtThisLocus.add(feature);
|
||||
}
|
||||
}
|
||||
Collection<SAMRecord> sampledReadsStartingAtThisLocus = new ArrayList<SAMRecord>();
|
||||
for(SAMRecord read: pileup.getReads()) {
|
||||
if(read.getReferenceName().equals(ref.getLocus().getContig()) && read.getAlignmentStart() == ref.getLocus().getStart())
|
||||
sampledReadsStartingAtThisLocus.add(read);
|
||||
}
|
||||
|
||||
int matchingReadsFound = 0;
|
||||
if(unsampledReadsStartingAtThisLocus.isEmpty()) {
|
||||
if(!sampledReadsStartingAtThisLocus.isEmpty())
|
||||
throw new StingException("Downsampler hallucinated a read starting at locus "+ref.getLocus());
|
||||
}
|
||||
else {
|
||||
boolean foundMatch = false;
|
||||
for(SAMReadFeature unsampledRead: unsampledReadsStartingAtThisLocus) {
|
||||
for(SAMRecord sampledRead: sampledReadsStartingAtThisLocus) {
|
||||
if(unsampledRead.getReadName().equals(sampledRead.getReadName()) &&
|
||||
Arrays.equals(unsampledRead.getReadBases(),sampledRead.getReadBases())) {
|
||||
foundMatch = true;
|
||||
matchingReadsFound++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!foundMatch)
|
||||
throw new StingException("Downsampler failed to include any read starting at locus "+ref.getLocus());
|
||||
|
||||
if(matchingReadsFound > maxExpectedNumberOfReads)
|
||||
throw new StingException("Downsampler found too many reads starting at locus "+ref.getLocus());
|
||||
}
|
||||
|
||||
return matchingReadsFound;
|
||||
}
|
||||
|
||||
// Given result of map function
|
||||
public Long reduceInit() { return 0L; }
|
||||
public Long reduce(Integer value, Long sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public Long treeReduce(Long lhs, Long rhs ) {
|
||||
return lhs+rhs;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue